X-Git-Url: http://nitlanguage.org diff --git a/lib/saxophonit/saxophonit.nit b/lib/saxophonit/saxophonit.nit index 22d4314..6b3f616 100644 --- a/lib/saxophonit/saxophonit.nit +++ b/lib/saxophonit/saxophonit.nit @@ -10,3 +10,734 @@ # A SAX 2 parser in Nit. module saxophonit + +import sax +intrude import standard::file +private import reader_model +private import lexer + +# Implementation of the `XMLReader` interface. +# +# For the moment, only XML 1.0 is (partially) supported. +# +# The following mandatory features of XML 1.0 are not yet supported: +# +# * Parsing of entities (files) encoded in UTF-16. +# * Encoding handling. +# * Resolution of entity references (except for built-in references). +# * Handling of the options specified in the XML declaration. +# * Parsing of a `DOCTYPE` declaration. +# +# Also note that this XML processor is unable to retrieve a file from a URL +# (only local paths are supported). +# +# Usage example: +# +# # Retrieve all text nodes. +# class TextListener +# super ContentHandler +# # +# private var buf: Buffer = new FlatBuffer +# private var sp: Bool = false +# # +# redef fun characters(str: String) do +# if sp then +# if buf.length > 0 then buf.append(" ") +# sp = false +# end +# buf.append(str) +# end +# # +# redef fun ignorable_whitespace(str: String) do +# sp = true +# end +# # +# # Return the concatenation of all text nodes. +# redef fun to_s do return buf.to_s +# end +# # +# var text = new TextListener +# var reader = new XophonReader +# # +# reader.content_handler = text +# reader.parse(new InputSource.with_stream(new StringReader("bar baz 42."))) +# assert text.to_s == "bar baz 42." 
class XophonReader
	super XMLReader

	# State shared with the lexer: handlers, features, properties and locator.
	private var model = new XophonReaderModel

	# Lexer of the entity being parsed.
	#
	# Assigned by `parse` before `parse_main` is called.
	private var lexer: XophonLexer is noinit

	# Handler accessors: each one reads or writes the matching model attribute.

	redef fun entity_resolver: nullable EntityResolver do return model.entity_resolver
	redef fun entity_resolver=(entity_resolver: nullable EntityResolver) do
		model.entity_resolver = entity_resolver
	end

	redef fun dtd_handler: nullable DTDHandler do return model.dtd_handler
	redef fun dtd_handler=(dtd_handler: nullable DTDHandler) do
		model.dtd_handler = dtd_handler
	end

	redef fun content_handler: nullable ContentHandler do return model.content_handler
	redef fun content_handler=(content_handler: nullable ContentHandler) do
		model.content_handler = content_handler
	end

	redef fun error_handler: nullable ErrorHandler do return model.error_handler
	redef fun error_handler=(error_handler: nullable ErrorHandler) do
		model.error_handler = error_handler
	end


	# Feature and property queries are delegated to the model's method of the
	# same name.

	redef fun feature_recognized(name: String): Bool do
		return model.feature_recognized(name)
	end

	redef fun feature_readable(name: String): Bool do
		return model.feature_readable(name)
	end

	redef fun feature_writable(name: String): Bool do
		# Ask the model about writability, not readability.
		# NOTE(review): assumes `XophonReaderModel` declares `feature_writable`,
		# mirroring `property_writable` below. Confirm in `reader_model`.
		return model.feature_writable(name)
	end

	redef fun feature(name: String): Bool do return model.feature(name)
	redef fun feature=(name: String, value: Bool) do model.feature(name) = value

	redef fun property_recognized(name: String): Bool do
		return model.property_recognized(name)
	end

	redef fun property_readable(name: String): Bool do
		return model.property_readable(name)
	end

	redef fun property_writable(name: String): Bool do
		return model.property_writable(name)
	end

	redef fun property(name: String): nullable Object do
		return model.property(name)
	end

	redef fun property=(name: String, value: nullable Object) do
		model.property(name) = value
	end

	# Parse the document described by `input`.
	#
	# A fresh locator is installed in the model on every call. When `input`
	# carries a system identifier, it is resolved: a resolution failure is
	# reported as a warning, a success is recorded in the locator.
	#
	# The source is then chosen in this order:
	#
	# 1. `input.stream`, when set. The stream is not closed here.
	# 2. The file at the resolved system identifier. The lexer is closed once
	#    parsing is done. A fatal error is fired if resolution failed.
	# 3. Neither is available: a fatal error is fired.
	redef fun parse(input: InputSource) do
		var system_id: nullable MaybeError[String, Error] = null
		model.locator = new SAXLocatorImpl

		if input.system_id != null then
			system_id = resolve_system_id(input.system_id.as(not null))
			if system_id.is_error then
				model.fire_warning(system_id.error.message, system_id.error)
			else
				model.locator.system_id = system_id.value
			end
		end
		model.locator.public_id = input.public_id
		# TODO: encoding

		if input.stream != null then
			lexer = new XophonLexer(model, input.stream.as(not null))
			parse_main
		else if system_id != null then
			if system_id.is_error then
				model.fire_fatal_error("File <{input.system_id.as(not null)}> not found.", null)
			else
				lexer = new XophonLexer(model,
						new FileReader.open(system_id.value))
				parse_main
				lexer.close
			end
		else
			model.fire_fatal_error("At least a stream or a system identifier must be specified. None given.",
					null)
		end
	end

	# Parse the entity designated by `system_id`.
	#
	# Shortcut for `parse` with an `InputSource` built from `system_id`.
	redef fun parse_file(system_id: String) do
		parse(new InputSource.with_system_id(system_id))
	end


	############################################################################
	# Parsing

	# Note: Every `expect_*` function (except `parse_main`) does not call
	# `read_char` for the first byte and leaves the byte just after its
	# production in `last_char` (except in case of fatal error). They return
	# `false` on fatal error and at the end of the file.

	# Parse the main entity.
	#
	# Fires the document locator and `start_document` events, starts the lexer,
	# parses the `document` production, then fires `end_document`. The result
	# of `expect_document` is not inspected, so `end_document` is always fired.
	private fun parse_main do
		model.fire_document_locator
		model.fire_start_document
		lexer.start
		expect_document
		model.fire_end_document
	end

	# Expect a `document` production.
	#
	# Return `false` as soon as a sub-production fails or an unexpected
	# character is met. Otherwise, return the result of `expect_miscs`.
	private fun expect_document: Bool do
		var got_doctype = false
		var got_element = false

		# If the document starts with `<`, it may start with an XML declaration,
		# a processing instruction, a comment, a `DOCTYPE` declaration, the
		# root element or a white space.
		if lexer.accept('<') then
			if lexer.accept('?') then
				# Only here may `<?` introduce the XML declaration.
				if not expect_pi_or_xml_decl then return false
			else if lexer.accept('!') then
				if lexer.accept('-') then
					if not lexer.expect('-',
							" at the beginning of a comment") or
							not expect_comment then
						return false
					end
				else
					if not expect_doctype_decl then return false
					got_doctype = true
				end
			else
				if not expect_root then return false
				# The `DOCTYPE` declaration *must* come before the root
				# element.
				got_doctype = true
				got_element = true
			end
		else if not lexer.accept_s then
			return lexer.fire_unexpected_char(
					". Expecting a white space or `<`")
		end

		# After the XML declaration (if there is one), the document may contain
		# processing instructions, comments, the `DOCTYPE` declaration and
		# the root element.
		# These productions may be separated by white space.
		while not got_element do
			if lexer.accept('<') then
				if lexer.accept('?') then
					if not expect_pi then return false
				else if lexer.accept('!') then
					if lexer.accept('-') then
						if not lexer.expect('-',
								" at the beginning of a comment") or
								not expect_comment then
							return false
						end
					else if got_doctype then
						# A second `DOCTYPE` declaration is not allowed: after
						# `<!`, only a comment may follow.
						return lexer.fire_unexpected_char(". Expecting `-`")
					else if expect_doctype_decl then
						got_doctype = true
					else
						return false
					end
				else
					if not expect_root then return false
					# The `DOCTYPE` declaration *must* come before the root
					# element.
					got_doctype = true
					got_element = true
				end
			else if not lexer.accept_s then
				return lexer.fire_unexpected_char(
						". Expecting a white space or `<`")
			end
		end
		return expect_miscs
	end

	# Expect a `doctypedecl` production.
	#
	# Not implemented yet: unconditionally reports a fatal error through the
	# model and returns the result of `fire_fatal_error`.
	private fun expect_doctype_decl: Bool do
		return model.fire_fatal_error("DTD not supported yet.\n", null) # TODO
	end

	# Expect the root `element` production, without the first `<` token.
+ private fun expect_root: Bool do + var success = true + var char_data = new FlatBuffer + + success = expect_stag + while success and not lexer.eof and not model.root_closed do + success = expect_content_chunk(char_data) + end + if success then + success = model.expect_root_closed + end + flush(char_data) + return success + end + + # Parse a `EmptyElemTag | STag | ETag | Reference | CDSect | PI | Comment | CharData` production. + # + # If the last read byte matches the `CharData` production, push the char in + # `char_data`. Else, flush `CharData` as a `characters` event. + private fun expect_content_chunk(char_data: Buffer): Bool do + if lexer.accept('<') then + flush(char_data) + if lexer.accept('!') then + if lexer.accept('-') then + return lexer.expect('-', + " at the beginning of a comment") and + expect_comment + else if lexer.accept('[') then + return expect_cd_sect + else + return lexer.fire_unexpected_char( + ". Expecting `--` or `[CDATA[`") + end + else if lexer.accept('?') then + return expect_pi + else if lexer.accept('/') then + return expect_etag + else + return expect_stag + end + else if lexer.accept('&') then + flush(char_data) + var success = expect_reference(char_data) + flush(char_data) + return success + else + return lexer.expect_xml_char(char_data) + end + end + + # Expect a `EmptyElemTag | STag` production, without the initial `<`. 
+ private fun expect_stag: Bool do + var name_buffer = new FlatBuffer + + if lexer.expect_name(name_buffer) then + var name = name_buffer.to_s + + model.fire_start_attributes + loop + if lexer.accept('>') then + model.fire_start_element(name) + return true + else if lexer.accept('/') then + if lexer.expect('>', "") then + model.fire_start_element(name) + model.fire_end_element(name) + return true + else + return false + end + else if lexer.expect_s then + if lexer.accept('/') then + if lexer.expect('>', "") then + model.fire_start_element(name) + model.fire_end_element(name) + return true + else + return false + end + else if lexer.accept('>') then + model.fire_start_element(name) + return true + else if not expect_attribute then + return false + end + else + return lexer.fire_unexpected_char(" in tag. " + + "Expecting an attribute, `/`, `>` or white space") + end + end + end + return false + end + + # Expect a `ETag` production, without the initial `', "") then + return model.fire_end_element(name_buf.to_s) + else + return false + end + end + + # Expect an `Attributes` production. + private fun expect_attribute: Bool do + var name = new FlatBuffer + var value = new FlatBuffer + + if lexer.expect_name(name) and + lexer.expect_eq and + expect_att_value(value) then + model.fire_attribute(name.to_s, value.to_s) + return true + else + return false + end + end + + # Expect the `Misc*` production at the end of a document. + private fun expect_miscs: Bool do + while not lexer.eof do + if lexer.accept('<') then + if lexer.accept('?') then + if not expect_pi then return false + else if lexer.accept('!') then + if not lexer.expect_string("--", + " at the beginning of a comment") or + not expect_comment then + return false + end + else + return lexer.fire_unexpected_char(". Expecting `?` or `!`") + end + else if not lexer.accept_s then + return lexer.fire_unexpected_char( + ". 
Expecting a white space or `<`") + end + end + return true + end + + # Expect a `AttValue` production. + # + # Append the parsed value to `buffer`. + private fun expect_att_value(buffer: Buffer): Bool do + var delimiter = lexer.expect_delimiter + + if delimiter < 0 then return false + loop + if lexer.accept_int(delimiter) then + return true + else if lexer.accept('&') then + # TODO: [WFC: No < in Attribute Values] + if not expect_reference(buffer) then return false + else if not lexer.expect_att_value_char(buffer) then + return false + end + end + end + + # Expect a `SystemLiteral` production. + # + # Also used to parse productions that do not have references. + # Append the parsed value to `buffer`. + private fun expect_literal(buffer: Buffer): Bool do + var delimiter = lexer.expect_delimiter + + if delimiter < 0 then return false + loop + if lexer.accept_int(delimiter) then + return true + else if not lexer.expect_xml_char(buffer) then + return false + end + end + end + + + # Expect a `Comment` production, without the beginning. + # + # Assume `last_char` is the fifth byte of the production that is, the + # next byte after the `'