xml: Introduce SAXophoNit, a SAX processor in Nit.
authorJean-Christophe Beaupré <jcbrinfo@users.noreply.github.com>
Thu, 9 Oct 2014 14:53:46 +0000 (10:53 -0400)
committerJean-Christophe Beaupré <jcbrinfo@users.noreply.github.com>
Mon, 27 Oct 2014 17:50:02 +0000 (13:50 -0400)
For the moment, this implementation is mostly non-compliant, but it
works with most common XML documents.

Signed-off-by: Jean-Christophe Beaupré <jcbrinfo@users.noreply.github.com>

lib/saxophonit/lexer.nit [new file with mode: 0644]
lib/saxophonit/reader_model.nit [new file with mode: 0644]
lib/saxophonit/saxophonit.nit

diff --git a/lib/saxophonit/lexer.nit b/lib/saxophonit/lexer.nit
new file mode 100644 (file)
index 0000000..4f77ab8
--- /dev/null
@@ -0,0 +1,392 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# This file is free software, which comes along with NIT. This software is
+# distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE. You can modify it is you want, provided this header
+# is kept unaltered, and a notification of the changes is added.
+# You are allowed to redistribute it and sell it, alone or is a part of
+# another product.
+
+# SAXophoNit’s lexer
+module saxophonit::lexer
+
+import reader_model
+
+# SAXophoNit’s lexer
+#
+# Except when noted otherwise, `accept` and `expect` functions return `true` on
+# success and `false` on mismatch and at the end of the file.
+# They both forward the cursor to the next byte on success, but only `expect`
+# functions fire a fatal error on mismatch.
+class XophonLexer
+       var reader_model: XophonReaderModel
+       var input: IStream is writable
+       private var locator: SAXLocatorImpl is noinit
+
+       init do
+               locator = reader_model.locator.as(not null)
+       end
+
+       # Last read byte.
+       #
+       # Equals `-1` on end of file or error.
+       private var last_char: Int = -1
+
+       # Before end-of-line handling, was the last read byte a CARRIAGE RETURN?
+       private var was_cr: Bool = false
+
+
+       # Expect a value delimiter (`"` or `'`).
+       #
+       # If the last read byte is a delimiter, return the delimiter and
+       # read the next byte. Else, return `-1`.
+       fun expect_delimiter: Int do
+               if accept('"') then
+                       return '"'.ascii
+               else if accept('\'') then
+                       return '\''.ascii
+               else
+                       fire_unexpected_char(". Expecting `\"` or `'`")
+                       return -1
+               end
+       end
+
+       # Does the last read byte match the `Char` production?
+       #
+       # Accepts every value from 32 (SPACE) upward, plus 9 (TAB) and
+       # 10 (LINE FEED). CARRIAGE RETURN is not listed because `read_char`
+       # replaces it with a LINE FEED before storing it in `last_char`.
+       # The end-of-file marker (`-1`) never matches.
+       fun is_xml_char:Bool do
+               # TODO: Handle code points above 0x7F.
+               return last_char >= 32 or
+                               last_char == 9 or
+                               last_char == 10
+       end
+
+       # Push the last read byte in the specified buffer and read the next byte.
+       #
+       # If the last read byte is forbidden, fire a fatal error instead.
+       fun expect_xml_char(buffer: Buffer): Bool do
+               if is_xml_char then
+                       buffer.chars.push(last_char.ascii)
+                       read_char
+                       return true
+               else if eof then
+                       return fire_fatal_error("Unexpected end of file.")
+               else
+                       return fire_fatal_error("Forbidden character.")
+               end
+       end
+
+
+       # Like `expect_xml_char`, but normalize white space and forbid `<`.
+       #
+       # SEE: The “3.3.3 Attribute-Value Normalization” section of any XML
+       # recommendation.
+       fun expect_att_value_char(buffer: Buffer): Bool do
+               if is_s then
+                       buffer.chars.push(' ')
+                       read_char
+                       return true
+               else if last_char == '<'.ascii then
+                       return fire_fatal_error("`<` is forbidden in attribute values.")
+               else
+                       return expect_xml_char(buffer)
+               end
+       end
+
+       # Does the last read byte match the `S` production?
+       #
+       # Matches SPACE (32), TAB (9) and LINE FEED (10). CARRIAGE RETURN never
+       # shows up here: `read_char` stores it as a LINE FEED.
+       fun is_s:Bool do
+               return last_char == 32 or last_char == 9 or last_char == 10
+       end
+
+       # Skip a `S?` token and return `true`.
+       fun skip_s: Bool do
+               while is_s do read_char
+               return true
+       end
+
+       # Accept a `S` token.
+       fun accept_s: Bool do
+               if is_s then
+                       read_char
+                       return skip_s
+               else
+                       return false
+               end
+       end
+
+       # Expect an `S` token (at least one white space byte).
+       #
+       # Consume the whole run of white space and return `true`. If the last
+       # read byte is not white space, fire a fatal error and return `false`.
+       fun expect_s: Bool do
+               if accept_s then
+                       # `accept_s` already consumed the run; `skip_s` only returns `true`.
+                       return skip_s
+               end
+               return fire_unexpected_char(". Expecting white space")
+       end
+
+       # Is the last read byte matches the `NameStartChar` production?
+       fun is_name_start_char: Bool do
+               # TODO: Handle code points above 0x7F.
+               return ['A'.ascii .. 'Z'.ascii].has(last_char) or
+                               ['a'.ascii .. 'z'.ascii].has(last_char) or
+                               last_char == '_'.ascii or
+                               last_char == ':'.ascii or
+                               last_char > 127
+       end
+
+       # Is the last read byte matches the `NameChar` production?
+       fun is_name_char: Bool do
+               # TODO: Handle code points above 0x7F.
+               return is_name_start_char or
+                               last_char == '-'.ascii or
+                               last_char == '.'.ascii or
+                               is_digit
+       end
+
+       # Expect a `Name` token.
+       #
+       # Append the parsed name to `buffer`.
+       #
+       # If the last read byte cannot start a name, fire a fatal error and
+       # return `false` without touching `buffer`. Otherwise, consume bytes
+       # while they match `NameChar` and return `true`.
+       fun expect_name(buffer: Buffer): Bool do
+               if not is_name_start_char then
+                       return fire_unexpected_char(" at the beginning of a name")
+               end
+               buffer.chars.push(last_char.ascii)
+               read_char
+               while is_name_char do
+                       buffer.chars.push(last_char.ascii)
+                       read_char
+               end
+               return true
+       end
+
+       # Expect a `PITarget` token.
+       #
+       # Append the parsed name to `buffer`.
+       fun expect_pi_target(buffer: Buffer): Bool do
+               return expect_name(buffer) and check_pi_target(buffer)
+       end
+
+       # Ensure the target is not `xml` (case-insensitive).
+       #
+       # If `target` is any case variant of `xml`, fire a fatal error and
+       # return `false`. Otherwise return `true`, after firing a (non-fatal)
+       # error if the target contains a colon.
+       fun check_pi_target(target: Text): Bool do
+               var chars = target.chars
+               # Each letter is compared at its own position. The length test comes
+               # first so that the indexed accesses are always in range.
+               var is_invalid = target.length == 3 and
+                               (chars[0] == 'X' or chars[0] == 'x') and
+                               (chars[1] == 'M' or chars[1] == 'm') and
+                               (chars[2] == 'L' or chars[2] == 'l')
+
+               if is_invalid then
+                       return fire_fatal_error("Forbidden processing target `{target}`.")
+               else
+                       if target.has(":") then
+                               reader_model.fire_error("The processing target `{target}` contains a colon.", null)
+                       end
+                       return true
+               end
+       end
+
+       # Is the last read byte matches the `[0-9]` production?
+       fun is_digit: Bool do
+               return ['0'.ascii .. '9'.ascii].has(last_char)
+       end
+
+       # Accept a `[0-9]+` token.
+       fun accept_digits(buffer: Buffer): Bool do
+               if is_digit then
+                       loop
+                               buffer.chars.push(last_char.ascii)
+                               read_char
+                               if not is_digit then return true
+                       end
+               else
+                       return false
+               end
+       end
+
+       # Expect a `[0-9]+` token.
+       fun expect_digits(buffer: Buffer): Bool do
+               return accept_digits(buffer) or fire_unexpected_char(". Expecting a decimal digit")
+       end
+
+       # Does `last_char` match the `[0-9a-fA-F]` production?
+       #
+       # Only the six letters `A`–`F` (in either case) count as hexadecimal
+       # digits, in addition to `0`–`9`.
+       fun is_hex: Bool do
+               return ['0'.ascii .. '9'.ascii].has(last_char) or
+                               ['A'.ascii .. 'F'.ascii].has(last_char) or
+                               ['a'.ascii .. 'f'.ascii].has(last_char)
+       end
+
+       # Expect a `[0-9a-fA-F]+` token.
+       fun expect_hex(buffer: Buffer): Bool do
+               if is_hex then
+                       loop
+                               buffer.chars.push(last_char.ascii)
+                               read_char
+                               if not is_hex then return true
+                       end
+               else
+                       return fire_unexpected_char(". Expecting an hexadecimal digit")
+               end
+       end
+
+       # Expect `Eq`.
+       fun expect_eq: Bool do
+               return skip_s and expect('=', "") and skip_s
+       end
+
+
+       ############################################################################
+       # General
+
+       # Read a byte and put it in `last_char`.
+       #
+       # In case of an end-of-file or an error, put -1 in `last_char`.
+       #
+       # Before reading, the locator is updated according to the byte that is
+       # being left behind (the previous value of `last_char`). After reading,
+       # end-of-line sequences are normalized: `\r\n` and a lone `\r` are both
+       # exposed as a single `\n`.
+       private fun read_char do
+               # A negative line number is treated as "nothing read yet".
+               if locator.line_number < 0 then
+                       locator.line_number = 1
+                       locator.column_number = 1
+               else if last_char < 0 then
+                       # Reading past the end (or after a fatal error) is a caller bug.
+                       fire_fatal_error("Internal error: Already at the end of the file.")
+                       return
+               else if last_char == '\n'.ascii then
+                       locator.line_number += 1
+                       locator.column_number = 1
+               else
+                       locator.column_number += 1
+               end
+
+               last_char = input.read_char
+               if last_char < 0 then
+                       return
+               end
+
+               # XML 1.0 end-of-line handling
+               # Note: Regardless of the XML version, any EOL defined by the
+               # recommendation MUST be reported as a single LINE FEED.
+               if was_cr and last_char == '\n'.ascii then
+                       # EOL already reported. => Skip this byte.
+                       last_char = input.read_char
+               end
+               # Remember the raw byte before it is rewritten below, so that the next
+               # call can recognize the second half of a `\r\n` pair.
+               was_cr = last_char == '\r'.ascii
+               if was_cr then
+                       # Regardless of the following byte, '\r' always introduces an EOL.
+                       last_char = '\n'.ascii
+               end
+       end
+
+       # Is it the end of the stream?
+       #
+       # Also return `true` after a fatal error.
+       fun eof: Bool do return last_char < 0
+
+       # Start the lexer.
+       fun start do
+               if eof then
+                       last_char = 0
+                       read_char
+               end
+       end
+
+       # Close the input.
+       fun close do
+               last_char = -1
+               input.close
+       end
+
+       # Does the last read byte equal `c`?
+       fun is_int(c: Int): Bool do return last_char == c
+
+       # Does the last read byte equal `c`?
+       fun is_char(c: Char): Bool do return last_char == c.ascii
+
+       # Accept the specified byte.
+       #
+       # If the last read byte equals `expected`, read the next byte and return
+       # `true`. Otherwise, return `false` without firing any error and without
+       # moving the cursor.
+       fun accept_int(expected: Int): Bool do
+               if last_char == expected then
+                       read_char
+                       return true
+               else
+                       return false
+               end
+       end
+
+       # Accept the specified byte.
+       fun accept(expected: Char): Bool do
+               return accept_int(expected.ascii)
+       end
+
+       # Ensure the last read byte is equal to `expected`.
+       #
+       # If it is, read the next byte. If not, fire a fatal error using
+       # `context`. `context` is the part of the message that gives the context.
+       # For example, in `Unexpected ``x`` in y. Expecting ``z``.`, the value of
+       # `context` is `" in y"`.
+       #
+       # Return `true` if and only if the last read byte has the expected value.
+       fun expect_int(expected: Int, context: String): Bool do
+               # No trailing period here: `fire_unexpected_char` appends one itself.
+               return accept_int(expected) or
+                               fire_unexpected_char("{context}. Expecting `{expected.ascii}`")
+       end
+
+       # Ensure the last read byte is equal to `expected`.
+       #
+       # If it is, read the next byte. If not, fire a fatal error using
+       # `context`. `context` is the part of the message that gives the context.
+       # For example, in `Unexpected ``x`` in y. Expecting ``z``.`, the value of
+       # `context` is `" in y"`.
+       #
+       # Return `true` if and only if the last read byte has the expected value.
+       fun expect(expected: Char, context: String): Bool do
+               # No trailing period here: `fire_unexpected_char` appends one itself.
+               return accept(expected) or
+                               fire_unexpected_char("{context}. Expecting `{expected}`")
+       end
+
+       # Ensure the last read byte and following bytes match `expected`.
+       #
+       # If it is, read one more byte. If not, fire a fatal error using
+       # `context`. `context` is the part of the message that gives the context.
+       # For example, in `Unexpected ``x`` in y. Expecting ``z``.`, the value of
+       # `context` is `" in y"`.
+       #
+       # Return `true` if and only if the last read byte and following bytes
+       # match `expected`.
+       fun expect_string(expected: String, context: String): Bool do
+               var chars = expected.chars
+               var i: Int = 0
+
+               while i < chars.length do
+                       if not accept(chars[i]) then
+                               if is_xml_char then
+                                       return fire_fatal_error("Unexpected " +
+                                                       "`{expected.substring(0, i)}{last_char.ascii.to_s}`" +
+                                                       "{context}. Expecting `{expected}`.")
+                               else if eof then
+                                       return fire_fatal_error("Unexpected end of file{context}. " +
+                                                       "Expecting `{expected}`.")
+                               else
+                                       return fire_fatal_error("Forbidden character.")
+                               end
+                       end
+                       i += 1
+               end
+               return true
+       end
+
+
+       ############################################################################
+       # Dispatching
+
+       # Fire a fatal error about an unexpected character.
+       #
+       # `rest_of_message` is inserted right after the description of the
+       # offending byte (or of the end of file) and is followed by a period, so
+       # it should not end with one itself. When the last read byte is neither
+       # a valid XML character nor the end of file, `rest_of_message` is ignored
+       # and a generic "Forbidden character." error is fired instead.
+       #
+       # Return `false`.
+       fun fire_unexpected_char(rest_of_message: String): Bool do
+               if is_xml_char then
+                       return fire_fatal_error("Unexpected character `{last_char.ascii.to_s}`{rest_of_message}.")
+               else if eof then
+                       return fire_fatal_error("Unexpected end of file{rest_of_message}.")
+               else
+                       return fire_fatal_error("Forbidden character.")
+               end
+       end
+
+       # Fire a fatal error with the specified message.
+       #
+       # Return `false`.
+       private fun fire_fatal_error(message: String): Bool do
+               reader_model.fire_fatal_error(message, null)
+               last_char = -1
+               return false
+       end
+end
diff --git a/lib/saxophonit/reader_model.nit b/lib/saxophonit/reader_model.nit
new file mode 100644 (file)
index 0000000..ff2299a
--- /dev/null
@@ -0,0 +1,357 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# This file is free software, which comes along with NIT. This software is
+# distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE. You can modify it is you want, provided this header
+# is kept unaltered, and a notification of the changes is added.
+# You are allowed to redistribute it and sell it, alone or is a part of
+# another product.
+
+# Reader’s model.
+module saxophonit::reader_model
+
+import sax
+import sax::helpers::sax_locator_impl
+import sax::helpers::attributes_impl
+import sax::helpers::namespace_support
+
+# Reader’s model.
+#
+# Handle event dispatching, settings and internal state.
+class XophonReaderModel
+
+       # Stack of the current element type qname and of the qnames of the ancestors.
+       #
+       # The current element is at the top, the root element is at the bottom.
+       # Used to check if end tags match start tags.
+       private var element_path = new Array[XmlName]
+
+       # Current element’s attributes
+       private var atts = new AttributesImpl
+
+       private var ns = new NamespaceSupport
+
+       # Regular expression to match a `Name` against the `QName` production of
+       # “Namespaces in XML”.
+       private var qname_re: Regex = "^[^:]+(:[^:]+)?$".to_re
+
+       var locator: nullable SAXLocatorImpl = null is writable
+
+
+       # TODO: Handle these features.
+
+       private var features: Map[String, Bool] = new HashMap[String, Bool]
+
+       # SEE: `sax`
+       var feature_namespaces_uri =
+                       "http://xml.org/sax/features/namespaces"
+
+       # SEE: `sax`
+       var feature_namespace_prefixes_uri =
+                       "http://xml.org/sax/features/namespace-prefixes"
+
+
+       # SEE: `sax::XMLReader.entity_resolver`
+       var entity_resolver: nullable EntityResolver = null is writable
+
+       # SEE: `sax::XMLReader.dtd_handler`
+       var dtd_handler: nullable DTDHandler = null is writable
+
+       # SEE: `sax::XMLReader.content_handler`
+       var content_handler: nullable ContentHandler = null is writable
+
+       # SEE: `sax::XMLReader.error_handler`
+       var error_handler: nullable ErrorHandler = null is writable
+
+
+       init do
+               qname_re.optimize_is_in = true
+               features[feature_namespaces_uri] = true
+               features[feature_namespace_prefixes_uri] = false
+       end
+
+       # SEE: `sax::XMLReader.feature_recognized`
+       fun feature_recognized(name: String): Bool do
+               return features.keys.has(name)
+       end
+
+       # SEE: `sax::XMLReader.feature_readable`
+       fun feature_readable(name: String): Bool do
+               return features.keys.has(name)
+       end
+
+       # SEE: `sax::XMLReader.feature_writable`
+       fun feature_writable(name: String): Bool do
+               return features.keys.has(name)
+       end
+
+       # SEE: `sax::XMLReader.feature`
+       fun feature(name: String): Bool do
+               assert feature_recognized: feature_recognized(name)
+               return features[name]
+       end
+
+       # SEE: `sax::XMLReader.feature=`
+       fun feature=(name: String, value: Bool) do
+               assert feature_recognized: feature_recognized(name)
+               if name == feature_namespaces_uri then
+                       assert legal_state: value or features[feature_namespace_prefixes_uri] else
+                               sys.stderr.write("At least one of <{feature_namespaces_uri}> or <{feature_namespace_prefixes_uri}> must be true.\n")
+                       end
+               else if name == feature_namespace_prefixes_uri then
+                       assert legal_state: value or features[feature_namespaces_uri] else
+                               sys.stderr.write("At least one of <{feature_namespaces_uri}> or <{feature_namespace_prefixes_uri}> must be true.\n")
+                       end
+               end
+               features[name] = value
+       end
+
+       # SEE: `sax::XMLReader.property_recognized`
+       fun property_recognized(name: String): Bool do return false
+
+       # SEE: `sax::XMLReader.property_readable`
+       fun property_readable(name: String): Bool do return false
+
+       # SEE: `sax::XMLReader.property_writable`
+       fun property_writable(name: String): Bool do return false
+
+       # SEE: `sax::XMLReader.property`
+       fun property(name: String): nullable Object do
+               assert property_recognized: false
+               return null
+       end
+
+       # SEE: `sax::XMLReader.property=`
+       fun property=(name: String, value: nullable Object) do
+               assert property_recognized: false
+       end
+
+       # Is the root element closed?
+       fun root_closed: Bool do return element_path.length <= 0
+
+       # Expect that no element is left open.
+       #
+       # Return `true` when the element stack is empty. Otherwise, fire a fatal
+       # error whose wording depends on how many elements are still open.
+       fun expect_root_closed: Bool do
+               var open_count = element_path.length
+
+               if open_count <= 0 then return true
+               if open_count > 1 then
+                       return fire_fatal_error("Reached the end of the file with " +
+                                       "{open_count.to_s} open elements.", null)
+               end
+               return fire_fatal_error("Reached the end of the file with an " +
+                               "open element.", null)
+       end
+
+
+       ###########################################################################
+       # Dispatching
+
+       # Set the document locator of the content handler, if needed.
+       fun fire_document_locator do
+               if content_handler != null then
+                       content_handler.document_locator = locator.as(not null)
+               end
+       end
+
+       # Fire the start of the document.
+       fun fire_start_document do
+               if content_handler != null then
+                       content_handler.start_document
+               end
+               ns.reset
+       end
+
+       # Fire the end of the document.
+       fun fire_end_document do
+               if content_handler != null then
+                       content_handler.end_document
+               end
+       end
+
+       # Fire the start of an attribute list.
+       fun fire_start_attributes do
+               atts.clear
+               ns.push_context
+       end
+
+       # Fire the appearance of an attribute.
+       #
+       # `qname` is the attribute name as written in the document and `value`
+       # is its value.
+       #
+       # Namespace declarations (`xmlns` and `xmlns:prefix`) start a prefix
+       # mapping. They are recorded as attributes only when the
+       # namespace-prefixes feature is enabled. Every other attribute is
+       # recorded with an empty namespace IRI and local name; those are filled
+       # in by `set_attribute_ns` when `fire_start_element` runs.
+       fun fire_attribute(qname: String, value: String) do
+               if "xmlns" == qname or qname.has_prefix("xmlns:") then
+                       # NOTE(review): for a bare `xmlns`, this asks for a substring starting
+                       # past the end of `qname`; presumably that yields `""` (the default
+                       # namespace prefix) — confirm.
+                       var prefix = qname.substring_from("xmlns:".length)
+
+                       # A prefix that still contains a colon is not treated as a
+                       # declaration: the attribute falls through and is recorded as is.
+                       if not prefix.has(":") then
+                               fire_start_prefix_mapping(prefix, value)
+                               if not feature(feature_namespace_prefixes_uri) then return
+                       end
+               end
+               # TODO: Types.
+               atts.add("", "", qname, "CDATA", value)
+       end
+
+       # Fire the start of an element.
+       fun fire_start_element(name: String) do
+               var parts = ["", "", ""]
+
+               for i in [0..atts.length[ do
+                       set_attribute_ns(i)
+               end
+               process_name(name, parts, false)
+               element_path.push(new XmlName(parts[0], parts[1], parts[2]))
+               if content_handler != null then
+                       content_handler.start_element(parts[0], parts[1], parts[2], atts)
+               end
+       end
+
+       # Now prefixes are mapped, set the expanded name of the attribute at `index`.
+       private fun set_attribute_ns(index: Int) do
+               var name = ["", "", ""]
+
+               process_name(atts.qname(index).as(not null), name, true)
+               atts.uri(index) = name[0]
+               atts.local_name(index) = name[1]
+       end
+
+       # Like `ns.process_name`, but with error handling.
+       #
+       # On return, `parts` holds the namespace IRI, the local name and the
+       # qualified name of `qname`, in that order. `parts` must already have
+       # three elements.
+       #
+       # If `qname` does not match the `QName` production, or if its prefix is
+       # not mapped in the current scope, a (non-fatal) error is fired and
+       # `parts` is set to an empty namespace IRI with `qname` as both the local
+       # name and the qualified name.
+       private fun process_name(qname: String, parts: Array[String],
+                       is_attribute: Bool) do
+               if qname.has(qname_re) then
+                       if ns.process_name(qname, parts, is_attribute) != null then return
+                       fire_error("The namespace IRI of `{qname}` was not found in " +
+                                       "this scope. Passing the original name as the local " +
+                                       "name.", null)
+               else
+                       fire_error("The name `{qname}` contains more than one colon. " +
+                                       "Passing the original name as the local name.", null)
+               end
+               # Write into the caller’s array. Rebinding `parts` to a new array would
+               # only change the local variable and the caller would never see it.
+               parts[0] = ""
+               parts[1] = qname
+               parts[2] = qname
+       end
+
+       # Fire the end of an element.
+       #
+       # `name` is the qualified name found in the closing tag. It must equal
+       # the qualified name of the innermost open element. If it does not, or
+       # if no element is open at all, a fatal error is fired.
+       #
+       # Return `true` on success.
+       #
+       # NOTE(review): `fire_start_attributes` pushes a namespace context, but
+       # nothing here pops it — confirm whether that is handled elsewhere.
+       fun fire_end_element(name: String):Bool do
+               # Report a closing tag with no open element as a parse error instead
+               # of asking an empty stack for its last item.
+               if element_path.is_empty then
+                       return fire_fatal_error("Unexpected closing tag (`{name}`): " +
+                                       "no element is open.", null)
+               end
+
+               var peek_name = element_path.last
+
+               if peek_name.qname == name then
+                       element_path.pop
+                       if content_handler != null then
+                               content_handler.end_element(peek_name.uri,
+                                               peek_name.local_name, peek_name.qname)
+                       end
+                       return true
+               else
+                       fire_fatal_error("The type in the closing tag (`{name}`) does " +
+                                       "not match the type in the opening tag " +
+                                       "(`{peek_name.qname}`).", null)
+                       return false
+               end
+       end
+
+       # Fire the start of a mapping between `prefix` and `uri`.
+       #
+       # The mapping is first declared in `ns`. If `ns` refuses it, a
+       # (non-fatal) error is fired. The content handler, if any, is then
+       # notified of the mapping.
+       #
+       # NOTE(review): the handler is notified even when the declaration was
+       # refused, although the error message says the declaration is ignored —
+       # confirm whether the notification should be skipped in that case.
+       private fun fire_start_prefix_mapping(prefix: String, uri: String) do
+               if not ns.declare_prefix(prefix, uri) then
+                       fire_error("The mapping between the prefix `{prefix}` and " +
+                                       "the namespace IRI `{uri}` breaks a built-in " +
+                                       "mapping. Ignoring the declaration.", null)
+               end
+               if content_handler != null then
+                       content_handler.start_prefix_mapping(prefix, uri)
+               end
+       end
+
+       # Fire the end of the current mapping of `prefix`.
+       private fun end_prefix_mapping(prefix: String) do
+               if content_handler != null then
+                       content_handler.end_prefix_mapping(prefix)
+               end
+       end
+
+       # Fire the appearance of a comment.
+       fun fire_comment(content: String) do
+               # TODO
+       end
+
+       # Fire the appearance of a processing instruction.
+       fun fire_processing_instruction(target: String, data: nullable String) do
+               if content_handler != null then
+                       content_handler.processing_instruction(target, data)
+               end
+       end
+
+       # Fire the start of a `CDATA` section.
+       fun fire_start_cdata do
+               # TODO
+       end
+
+       # Fire the end of a `CDATA` section.
+       fun fire_end_cdata do
+               # TODO
+       end
+
+       # Fire the appearance of a text node.
+       fun fire_characters(str: String) do
+               if content_handler != null then
+                       content_handler.characters(str)
+               end
+       end
+
+       private fun exception(message: String, cause: nullable Error):
+                       SAXParseException do
+               var e: SAXParseException
+
+               if locator == null then
+                       e = new SAXParseException(message)
+               else
+                       e = new SAXParseException.with_locator(message, locator.as(not null))
+               end
+               e.cause = cause
+               return e
+       end
+
+       # Fire a fatal error with the specified message and cause.
+       #
+       # A `SAXParseException` is built from `message` and `cause` (with the
+       # current locator, when there is one). If an error handler is set, the
+       # exception is passed to its `fatal_error` method; otherwise the
+       # exception is thrown.
+       #
+       # Return `false`.
+       fun fire_fatal_error(message: String, cause: nullable Error):Bool do
+               var e = exception(message, cause)
+
+               if error_handler == null then
+                       # Nobody to report to: the error must not go unnoticed.
+                       e.throw
+               else
+                       error_handler.fatal_error(e)
+               end
+               return false
+       end
+
+       # Fire an error with the specified message and cause.
+       fun fire_error(message: String, cause: nullable Error) do
+               var e = exception(message, cause)
+
+               if error_handler != null then
+                       error_handler.error(e)
+               end
+       end
+
+       # Fire a warning with the specified message and cause.
+       fun fire_warning(message: String, cause: nullable Error) do
+               var e = exception(message, cause)
+
+               if error_handler != null then
+                       error_handler.warning(e)
+               end
+       end
+end
+
+# An XML expanded name.
+private class XmlName
+       # Namespace IRI or `""`.
+       var uri: String
+
+       # Local name or `""`.
+       var local_name: String
+
+       # Original qualified name.
+       var qname: String
+end
index 22d4314..a225413 100644 (file)
 
 # A SAX 2 parser in Nit.
 module saxophonit
+
+import sax
+intrude import standard::file
+private import reader_model
+private import lexer
+
+# Implementation of the `XMLReader` interface.
+#
+# For the moment, only XML 1.0 is (partially) supported.
+#
+# The following mandatory features of XML 1.0 are not yet supported:
+#
+# * Parsing of entities (files) encoded in UTF-16.
+# * Encoding handling.
+# * Entity references resolving (except for built-in references).
+# * Handling of the options specified in the XML declaration.
+# * Parsing of a `DOCTYPE` declaration.
+#
+# Also note that this XML processor is unable to retrieve a file from an URL
+# (only local paths are supported).
+class XophonReader
+       super XMLReader
+
+       private var model = new XophonReaderModel
+       private var lexer: XophonLexer is noinit
+
+       redef fun entity_resolver: nullable EntityResolver do return model.entity_resolver
+       redef fun entity_resolver=(entity_resolver: nullable EntityResolver) do
+               model.entity_resolver = entity_resolver
+       end
+
+       redef fun dtd_handler: nullable DTDHandler do return model.dtd_handler
+       redef fun dtd_handler=(dtd_handler: nullable DTDHandler) do
+               model.dtd_handler = dtd_handler
+       end
+
+       redef fun content_handler: nullable ContentHandler do return model.content_handler
+       redef fun content_handler=(content_handler: nullable ContentHandler) do
+               model.content_handler = content_handler
+       end
+
+       redef fun error_handler: nullable ErrorHandler do return model.error_handler
+       redef fun error_handler=(error_handler: nullable ErrorHandler) do
+               model.error_handler = error_handler
+       end
+
+
+       redef fun feature_recognized(name: String): Bool do
+               return model.feature_recognized(name)
+       end
+
+       redef fun feature_readable(name: String): Bool do
+               return model.feature_readable(name)
+       end
+
+       # Delegate to the model’s `feature_writable` (not `feature_readable`), so
+       # that the answer stays right if the two ever differ.
+       redef fun feature_writable(name: String): Bool do
+               return model.feature_writable(name)
+       end
+
+       redef fun feature(name: String): Bool do return model.feature(name)
+       redef fun feature=(name: String, value: Bool) do model.feature(name) = value
+
+       # Property queries and accessors all forward to the shared `model`.
+       redef fun property_recognized(name: String): Bool do return model.property_recognized(name)
+
+       redef fun property_readable(name: String): Bool do return model.property_readable(name)
+
+       redef fun property_writable(name: String): Bool do return model.property_writable(name)
+
+       redef fun property(name: String): nullable Object do return model.property(name)
+
+       redef fun property=(name: String, value: nullable Object) do model.property(name) = value
+
+       redef fun parse(input: InputSource) do
+               # Outcome of the system identifier resolution; `null` when the input
+               # has no system identifier.
+               var resolved: nullable MaybeError[String, Error] = null
+               model.locator = new SAXLocatorImpl
+
+               # A system identifier that cannot be resolved is only a warning at
+               # this point: the input may still provide a stream.
+               var raw_id = input.system_id
+               if raw_id != null then
+                       resolved = resolve_system_id(raw_id)
+                       if resolved.is_error then
+                               model.fire_warning(resolved.error.message, resolved.error)
+                       else
+                               model.locator.system_id = resolved.value
+                       end
+               end
+               model.locator.public_id = input.public_id
+               # TODO: encoding
+
+               # An explicit stream takes precedence over the system identifier.
+               # The lexer is not closed in this case.
+               var stream = input.stream
+               if stream != null then
+                       lexer = new XophonLexer(model, stream)
+                       parse_main
+                       return
+               end
+
+               # Neither a stream nor a system identifier.
+               if resolved == null then
+                       model.fire_fatal_error("At least a stream or a system identifier must be specified. None given.",
+                                       null)
+                       return
+               end
+
+               # No stream, and the system identifier could not be resolved.
+               if resolved.is_error then
+                       model.fire_fatal_error("File <{input.system_id.as(not null)}> not found.", null)
+                       return
+               end
+
+               # Open the resolved path, parse it, then close the lexer.
+               lexer = new XophonLexer(model, new IFStream.open(resolved.value))
+               parse_main
+               lexer.close
+       end
+
+       redef fun parse_file(system_id: String) do
+               # Wrap the identifier in an input source and reuse the generic path.
+               var source = new InputSource.with_system_id(system_id)
+               parse(source)
+       end
+
+
+       ############################################################################
+       # Parsing
+
+       # Note: Every `expect_*` function assumes that the first byte of its
+       # production is already in `last_char`, so it does not call `read_char`
+       # for that byte (`parse_main` is the one that starts the lexer). Except
+       # in case of fatal error, it leaves the byte just after its production in
+       # `last_char`. These functions return `false` on fatal error and at the
+       # end of the file.
+
+       # Parse the main entity.
+       private fun parse_main do
+               # Events are fired in a fixed order: locator, then start of document.
+               model.fire_document_locator
+               model.fire_start_document
+               # Start the lexer before parsing the `document` production.
+               lexer.start
+               # The result of `expect_document` is ignored: the end of the document
+               # is signaled whether or not parsing succeeded.
+               expect_document
+               model.fire_end_document
+       end
+
+       # Expect a `document` production.
+       private fun expect_document: Bool do
+               # `true` once a `DOCTYPE` declaration is no longer allowed, that is
+               # after the declaration itself or after the root element.
+               var got_doctype = false
+               # `true` once the root element has been parsed.
+               var got_element = false
+
+               # If the document starts with `<`, it may start with an XML
+               # declaration, a processing instruction, a comment, a `DOCTYPE`
+               # declaration or the root element. Otherwise, only a white space is
+               # accepted.
+               if lexer.accept('<') then
+                       if lexer.accept('?') then
+                               # Only here can `<?xml` be an XML declaration.
+                               if not expect_pi_or_xml_decl then return false
+                       else if lexer.accept('!') then
+                               if lexer.accept('-') then
+                                       if not lexer.expect('-',
+                                                               " at the beginning of a comment") or
+                                                               not expect_comment then
+                                               return false
+                                       end
+                               else
+                                       if not expect_doctype_decl then return false
+                                       got_doctype = true
+                               end
+                       else
+                               if not expect_root then return false
+                               # The `DOCTYPE` declaration *must* come before the root
+                               # element.
+                               got_doctype = true
+                               got_element = true
+                       end
+               else if not lexer.accept_s then
+                       return lexer.fire_unexpected_char(
+                                       ". Expecting a white space or `<`")
+               end
+
+               # After the XML declaration (if there is one), the document may contain
+               # processing instructions, comments, the `DOCTYPE` declaration and
+               # the root element.
+               # These productions may be separated by white space.
+               while not got_element do
+                       if lexer.accept('<') then
+                               if lexer.accept('?') then
+                                       if not expect_pi then return false
+                               else if lexer.accept('!') then
+                                       if lexer.accept('-') then
+                                               if not lexer.expect('-',
+                                                                       " at the beginning of a comment") or
+                                                                       not expect_comment then
+                                                       return false
+                                               end
+                                       else if got_doctype then
+                                               # A second `DOCTYPE` declaration is not allowed: only a
+                                               # comment may start with `<!` here.
+                                               return lexer.fire_unexpected_char(". Expecting `-`")
+                                       else if expect_doctype_decl then
+                                               got_doctype = true
+                                       else
+                                               return false
+                                       end
+                               else
+                                       if not expect_root then return false
+                                       # The `DOCTYPE` declaration *must* come before the root
+                                       # element.
+                                       got_doctype = true
+                                       got_element = true
+                               end
+                       else if not lexer.accept_s then
+                               return lexer.fire_unexpected_char(
+                                               ". Expecting a white space or `<`")
+                       end
+               end
+               # Only `Misc` productions may follow the root element.
+               return expect_miscs
+       end
+
+       # Expect a `DOCTYPE` declaration.
+       #
+       # Not implemented yet: nothing is read, a fatal error is fired and its
+       # result is returned.
+       private fun expect_doctype_decl: Bool do
+               # TODO
+               var message = "DTD not supported yet.\n"
+               return model.fire_fatal_error(message, null)
+       end
+
+       # Expect the root `element` production, without the first `<` token.
+       private fun expect_root: Bool do
+               # Character data accumulated between two markup constructs.
+               var text = new FlatBuffer
+               var ok = expect_stag
+
+               # Consume the content chunk by chunk until a failure, the end of the
+               # file or the closing of the root element.
+               loop
+                       if not ok or lexer.eof or model.root_closed then break
+                       ok = expect_content_chunk(text)
+               end
+               if ok then ok = model.expect_root_closed
+               # Whatever the outcome, report the pending character data.
+               flush(text)
+               return ok
+       end
+
+       # Parse a `EmptyElemTag | STag | ETag | Reference | CDSect | PI | Comment | CharData` production.
+       #
+       # If the last read byte matches the `CharData` production, push the char in
+       # `char_data`. Else, flush `CharData` as a `characters` event.
+       private fun expect_content_chunk(char_data: Buffer): Bool do
+               if lexer.accept('<') then
+                       # Markup: report the pending character data first.
+                       flush(char_data)
+                       if lexer.accept('!') then
+                               if lexer.accept('-') then
+                                       if not lexer.expect('-',
+                                                       " at the beginning of a comment") then
+                                               return false
+                                       end
+                                       return expect_comment
+                               end
+                               if lexer.accept('[') then return expect_cd_sect
+                               return lexer.fire_unexpected_char(
+                                               ". Expecting `--` or `[CDATA[`")
+                       end
+                       if lexer.accept('?') then return expect_pi
+                       if lexer.accept('/') then return expect_etag
+                       return expect_stag
+               end
+               if lexer.accept('&') then
+                       # The replacement text of a reference is reported as its own
+                       # `characters` event: flush before and after.
+                       flush(char_data)
+                       var resolved = expect_reference(char_data)
+                       flush(char_data)
+                       return resolved
+               end
+               # Plain character data: accumulate it.
+               return lexer.expect_xml_char(char_data)
+       end
+
+       # Expect a `EmptyElemTag | STag` production, without the initial `<`.
+       private fun expect_stag: Bool do
+               var name_buffer = new FlatBuffer
+
+               if not lexer.expect_name(name_buffer) then return false
+               var name = name_buffer.to_s
+
+               model.fire_start_attributes
+               loop
+                       # What the current iteration found: `/` (empty-element tag),
+                       # `>` (start tag), or neither (an attribute was parsed).
+                       var saw_slash = false
+                       var saw_end = false
+
+                       if lexer.accept('>') then
+                               saw_end = true
+                       else if lexer.accept('/') then
+                               saw_slash = true
+                       else if lexer.expect_s then
+                               if lexer.accept('/') then
+                                       saw_slash = true
+                               else if lexer.accept('>') then
+                                       saw_end = true
+                               else if not expect_attribute then
+                                       return false
+                               end
+                       else
+                               return lexer.fire_unexpected_char(" in tag. " +
+                                               "Expecting an attribute, `/`, `>` or white space")
+                       end
+
+                       if saw_slash then
+                               # Empty-element tag: `/` must be immediately followed by `>`.
+                               if not lexer.expect('>', "") then return false
+                               model.fire_start_element(name)
+                               model.fire_end_element(name)
+                               return true
+                       else if saw_end then
+                               model.fire_start_element(name)
+                               return true
+                       end
+               end
+       end
+
+       # Expect a `ETag` production, without the initial `</`.
+       private fun expect_etag: Bool do
+               var name_buf = new FlatBuffer
+
+               # Name, optional white space, then `>`.
+               if not lexer.expect_name(name_buf) then return false
+               if not lexer.skip_s then return false
+               if not lexer.expect('>', "") then return false
+               return model.fire_end_element(name_buf.to_s)
+       end
+
+       # Expect an `Attribute` production.
+       private fun expect_attribute: Bool do
+               var name = new FlatBuffer
+               var value = new FlatBuffer
+
+               # Name, `=`, then the quoted value.
+               if not lexer.expect_name(name) then return false
+               if not lexer.expect_eq then return false
+               if not expect_att_value(value) then return false
+               model.fire_attribute(name.to_s, value.to_s)
+               return true
+       end
+
+       # Expect the `Misc*` production at the end of a document.
+       private fun expect_miscs: Bool do
+               loop
+                       if lexer.eof then return true
+                       if not lexer.accept('<') then
+                               # Outside markup, only white space is allowed.
+                               if lexer.accept_s then continue
+                               return lexer.fire_unexpected_char(
+                                               ". Expecting a white space or `<`")
+                       end
+                       if lexer.accept('?') then
+                               if not expect_pi then return false
+                       else if lexer.accept('!') then
+                               if not lexer.expect_string("--",
+                                               " at the beginning of a comment") then
+                                       return false
+                               end
+                               if not expect_comment then return false
+                       else
+                               return lexer.fire_unexpected_char(". Expecting `?` or `!`")
+                       end
+               end
+       end
+
+       # Expect a `AttValue` production.
+       #
+       # Append the parsed value to `buffer`.
+       private fun expect_att_value(buffer: Buffer): Bool do
+               # Opening quote; a negative value signals a failure.
+               var quote = lexer.expect_delimiter
+
+               if quote < 0 then return false
+               # Read until the matching closing quote.
+               while not lexer.accept_int(quote) do
+                       if lexer.accept('&') then
+                               # TODO: [WFC: No < in Attribute Values]
+                               if not expect_reference(buffer) then return false
+                       else if not lexer.expect_att_value_char(buffer) then
+                               return false
+                       end
+               end
+               return true
+       end
+
+       # Expect a `SystemLiteral` production.
+       #
+       # Also used to parse productions that do not have references.
+       # Append the parsed value to `buffer`.
+       private fun expect_literal(buffer: Buffer): Bool do
+               # Opening quote; a negative value signals a failure.
+               var quote = lexer.expect_delimiter
+
+               if quote < 0 then return false
+               # Copy every character up to the matching closing quote.
+               while not lexer.accept_int(quote) do
+                       if not lexer.expect_xml_char(buffer) then return false
+               end
+               return true
+       end
+
+
+       # Expect a `Comment` production, without the beginning.
+       #
+       # Assume `last_char` is the fifth byte of the production that is, the
+       # next byte after the `'<!--'` token.
+       private fun expect_comment: Bool do
+               var text: Buffer = new FlatBuffer
+
+               loop
+                       if lexer.accept('-') then
+                               if lexer.accept('-') then
+                                       # A double hyphen may only be the end of the comment.
+                                       if lexer.expect('>',
+                                                       " after a double-hyphen (`--`) in a comment") then
+                                               model.fire_comment(text.to_s)
+                                               return true
+                                       end
+                                       return false
+                               end
+                               # Single hyphen: part of the comment, like the next char.
+                               text.chars.push('-')
+                       end
+                       if not lexer.expect_xml_char(text) then return false
+               end
+       end
+
+       # Expect a `PI` production, without the beginning.
+       #
+       # Assume `last_char` is the third byte of the production that is, the
+       # next byte after the `'<?'` token.
+       private fun expect_pi: Bool do
+               var target = new FlatBuffer
+
+               # Target first, then the data and the closing `?>`.
+               if not lexer.expect_pi_target(target) then return false
+               return expect_pi_data(target.to_s)
+       end
+
+       # Expect the data part and the `'?>'` token of a `PI` production.
+       private fun expect_pi_data(target: String): Bool do
+               if lexer.accept('?') then
+                       # No data at all: the target is directly followed by `?>`.
+                       if lexer.expect('>', " at the end of a processing instruction") then
+                               model.fire_processing_instruction(target, null)
+                               return true
+                       else
+                               return false
+                       end
+               else if lexer.accept_s then
+                       var data: Buffer = new FlatBuffer
+
+                       loop
+                               if lexer.accept('?') then
+                                       if lexer.accept('>') then break
+                                       # A `?` that is not followed by `>` is data. The next
+                                       # character is deliberately *not* consumed here: it may
+                                       # itself be the `?` of the closing `?>` (as in `??>`),
+                                       # so the next iteration must examine it.
+                                       data.chars.push('?')
+                               else if not lexer.expect_xml_char(data) then
+                                       return false
+                               end
+                       end
+                       model.fire_processing_instruction(target, data.to_s)
+                       return true
+               else
+                       return lexer.fire_unexpected_char(" after a processing " +
+                                       "instruction target. Expecting a white space or `?>`")
+               end
+       end
+
+       # Expect a `PI | XMLDecl` production, without the beginning.
+       #
+       # Assume `last_char` is the third byte of the production that is, the
+       # next byte after the `'<?'` token.
+       private fun expect_pi_or_xml_decl: Bool do
+               var buffer: Buffer = new FlatBuffer
+
+               if not lexer.expect_name(buffer) then return false
+               var target = buffer.to_s
+
+               # The exact target `xml` introduces the XML declaration. Any other
+               # name is checked as the target of a regular processing instruction.
+               if target == "xml" then return expect_xml_decl
+               if not lexer.check_pi_target(target) then return false
+               return expect_pi_data(target)
+       end
+
+       # Expect a `XMLDecl` production, without the initial `<?xml` token.
+       private fun expect_xml_decl: Bool do
+               if not expect_version_info then return false
+               # Without white space after the version, only `?>` may follow.
+               if not lexer.accept_s then return lexer.expect_string("?>", "")
+
+               if lexer.is_char('e') then
+                       if not expect_encoding_decl then return false
+                       # At this point, we can only accept `S` or `'?>'`.
+                       if not lexer.accept_s then return lexer.expect_string("?>", "")
+               end
+               if lexer.is_char('s') then
+                       if not expect_sd_decl then return false
+               end
+               if not lexer.skip_s then return false
+               return lexer.expect_string("?>", "")
+       end
+
+       # Expect a `EncodingDecl` token, without the initial `S` token.
+       private fun expect_encoding_decl: Bool do
+               var encoding = new FlatBuffer
+
+               # `encoding`, `=`, then the quoted encoding name.
+               if not lexer.expect_string("encoding", "") then return false
+               if not lexer.expect_eq then return false
+               if not expect_literal(encoding) then return false
+
+               # TODO: Do something with the value.
+               if encoding.has("^[A-Za-z][A-Za-z0-9._-]*$".to_re) then return true
+               return model.fire_fatal_error("`{encoding.to_s}` is not a valid " +
+                               "encoding name.", null)
+       end
+
+       # Expect a `SDDecl` token, without the initial `S` token.
+       private fun expect_sd_decl: Bool do
+               var literal = new FlatBuffer
+
+               # `standalone`, `=`, then the quoted value.
+               if not lexer.expect_string("standalone", "") then return false
+               if not lexer.expect_eq then return false
+               if not expect_literal(literal) then return false
+
+               var value = literal.to_s
+               # TODO: Do something with the value.
+               if value == "yes" or value == "no" then return true
+               return model.fire_fatal_error("`{value}` is not a valid value for " +
+                               "the `standalone` declaration. Expecting `yes` or `no`.",
+                               null)
+       end
+
+       # Expect a `CDSect` production, without the beginning.
+       #
+       # Assume `last_char` is the fourth byte of the production that is, the
+       # next byte after the `'<!['` token.
+       private fun expect_cd_sect: Bool do
+               var buffer: Buffer = new FlatBuffer
+
+               # Number of consecutive closing brackets read, but not yet pushed in
+               # `buffer` (they may belong to the `]]>` terminator).
+               var closing: Int = 0
+
+               if lexer.expect_string("CDATA[",
+                               " at the beginning of a CDATA section.") then
+                       model.fire_start_cdata
+                       loop
+                               if lexer.accept(']') then
+                                       closing += 1
+                               else if closing >= 2 and lexer.accept('>') then
+                                       # End of the section. Only the last two brackets belong
+                                       # to the terminator: the preceding ones are data
+                                       # (e.g. `]]]>` ends a section whose data ends with `]`).
+                                       # The test must come *before* `closing` is reset,
+                                       # otherwise the terminator is never recognized.
+                                       for i in [0..closing - 2[ do
+                                               buffer.chars.push(']')
+                                       end
+                                       break
+                               else
+                                       # The pending brackets were not a terminator: they are
+                                       # data, like the current character.
+                                       for i in [0..closing[ do
+                                               buffer.chars.push(']')
+                                       end
+                                       closing = 0
+                                       if not lexer.expect_xml_char(buffer) then return false
+                               end
+                       end
+                       flush(buffer)
+                       model.fire_end_cdata
+                       return true
+               else
+                       return false
+               end
+       end
+
+       # Expect a `VersionInfo` production.
+       private fun expect_version_info: Bool do
+               # White space, `version`, then `=`.
+               if not lexer.expect_s then return false
+               if not lexer.expect_string("version",
+                               " in the first attribute name of the XML declaration") then
+                       return false
+               end
+               if not lexer.expect_eq then return false
+
+               var minor: Buffer = new FlatBuffer
+               # Opening quote; a negative value signals a failure.
+               var quote = lexer.expect_delimiter
+
+               if quote < 0 then return false
+               # The quoted value is `1.` followed by digits and the same quote.
+               if not lexer.expect_string("1.", " as XML major version") then return false
+               if not lexer.expect_digits(minor) then return false
+               if not lexer.expect_int(quote, "") then return false
+
+               # Any minor version other than `0` is accepted with a warning.
+               if minor.to_s != "0" then
+                       model.fire_warning("Only XML 1.0 is supported. " +
+                                       "Got a XML 1.{minor.to_s} document.", null)
+               end
+               return true
+       end
+
+       # Expect a `Reference`, without the initial `&`.
+       #
+       # Append the value to the buffer.
+       private fun expect_reference(buffer: Buffer): Bool do
+               # TODO: [WFC: Entity Declared]
+               # TODO: [VC: Entity Declared]
+               # TODO: [WFC: Parsed Entity]
+               # TODO: [WFC: No Recursion]
+               # TODO: Unicode
+
+               var ref = new FlatBuffer
+
+               if lexer.accept('#') then
+                       # Character reference, hexadecimal (`&#x...;`) or decimal.
+                       if lexer.accept('x') then
+                               if not lexer.expect_hex(ref) then
+                                       return lexer.fire_unexpected_char(
+                                                       ". Expecting an hexadecimal digit")
+                               end
+                               buffer.chars.add(ref.to_hex.ascii)
+                               return lexer.expect(';', "")
+                       end
+                       if not lexer.accept_digits(ref) then
+                               return lexer.fire_unexpected_char(" in a character reference. " +
+                                               "Expecting `x` or a decimal digit")
+                       end
+                       buffer.chars.add(ref.to_i.ascii)
+                       return lexer.expect(';', "")
+               end
+
+               # Entity reference.
+               if not lexer.expect_name(ref) then
+                       return lexer.fire_unexpected_char(
+                                       " in a reference. Expecting `#` or a name")
+               end
+               var name = ref.to_s
+               # A colon in the name is reported, but parsing goes on.
+               if name.has(":") then
+                       model.fire_error("The entity name `{name}` contains a colon.", null)
+               end
+               var value = resolve_reference(name)
+
+               if value == null then
+                       model.fire_fatal_error("Unknown entity `{name}`.", null)
+                       return false
+               end
+               buffer.append(value)
+               return lexer.expect(';', "")
+       end
+
+       # Resolve the entity reference or return `null`.
+       private fun resolve_reference(name: String): nullable String do
+               # Only these five entity names are known.
+               # TODO: Support non-builtin entities
+               if name == "lt" then return "<"
+               if name == "gt" then return ">"
+               if name == "amp" then return "&"
+               if name == "quot" then return "\""
+               if name == "apos" then return "'"
+               return null
+       end
+
+       # Flush the specified buffer as a `characters` event.
+       #
+       # Do nothing if `buffer` is empty.
+       private fun flush(buffer: Buffer) do
+               # Nothing to report.
+               if buffer.length <= 0 then return
+               model.fire_characters(buffer.to_s)
+               buffer.clear
+       end
+
+
+       ############################################################################
+       # Paths
+
+       # Resolve the specified system id.
+       private fun resolve_system_id(system_id: String): MaybeError[String, Error] do
+               # For now, a system identifier is always handled as a local path.
+               # TODO: handle URIs
+               var resolved = realpath(system_id)
+               return resolved
+       end
+
+       # Resolve the specified POSIX path.
+       #
+       # Like `String.realpath`, but with error handling.
+       private fun realpath(path: String): MaybeError[String, Error] do
+               var native = path.to_cstring.file_realpath
+
+               # A non-null address is wrapped as the resolved path.
+               if not native.address_is_null then
+                       return new MaybeError[String, Error](native.to_s, null)
+               end
+               # A null address is reported as a "not found" error.
+               return new MaybeError[String, Error](null,
+                               new Error("File <{path}> not found."))
+       end
+end