# This file is part of NIT ( http://www.nitlanguage.org ). # # This file is free software, which comes along with NIT. This software is # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A # PARTICULAR PURPOSE. You can modify it is you want, provided this header # is kept unaltered, and a notification of the changes is added. # You are allowed to redistribute it and sell it, alone or is a part of # another product. # A SAX 2 parser in Nit. module saxophonit import sax intrude import core::file private import reader_model private import lexer # Implementation of the `XMLReader` interface. # # For the moment, only XML 1.0 is (partially) supported. # # The following mandatory features of XML 1.0 are not yet supported: # # * Parsing of entities (files) encoded in UTF-16. # * Encoding handling. # * Entity references resolving (except for built-in references). # * Handling of the options specified in the XML declaration. # * Parsing of a `DOCTYPE` declaration. # # Also note that this XML processor is unable to retrieve a file from an URL # (only local paths are supported). # # Usage example: # # # Retrieve all text nodes. # class TextListener # super ContentHandler # # # private var buf: Buffer = new FlatBuffer # private var sp: Bool = false # # # redef fun characters(str: String) do # if sp then # if buf.length > 0 then buf.append(" ") # sp = false # end # buf.append(str) # end # # # redef fun ignorable_whitespace(str: String) do # sp = true # end # # # # Return the concatenation of all text nodes. # redef fun to_s do return buf.to_s # end # # # var text = new TextListener # var reader = new XophonReader # # # reader.content_handler = text # reader.parse(new InputSource.with_stream(new StringReader("bar baz 42."))) # assert text.to_s == "bar baz 42." 
class XophonReader
	super XMLReader

	# Shared state of the reader: handlers, features, properties and locator.
	#
	# Every accessor below simply forwards to this object.
	private var model = new XophonReaderModel

	# Lexer of the entity being parsed. Assigned by `parse` before `parse_main`.
	private var lexer: XophonLexer is noinit

	# Entity resolver, as stored in `model`.
	redef fun entity_resolver do return model.entity_resolver

	# Store the entity resolver in `model`.
	redef fun entity_resolver=(entity_resolver) do
		model.entity_resolver = entity_resolver
	end

	# DTD handler, as stored in `model`.
	redef fun dtd_handler do return model.dtd_handler

	# Store the DTD handler in `model`.
	redef fun dtd_handler=(dtd_handler) do
		model.dtd_handler = dtd_handler
	end

	# Content handler, as stored in `model`.
	redef fun content_handler do return model.content_handler

	# Store the content handler in `model`.
	redef fun content_handler=(content_handler) do
		model.content_handler = content_handler
	end

	# Error handler, as stored in `model`.
	redef fun error_handler do return model.error_handler

	# Store the error handler in `model`.
	redef fun error_handler=(error_handler) do
		model.error_handler = error_handler
	end


	# Delegate to `model.feature_recognized`.
	redef fun feature_recognized(name) do
		return model.feature_recognized(name)
	end

	# Delegate to `model.feature_readable`.
	redef fun feature_readable(name) do
		return model.feature_readable(name)
	end

	# Delegate to `model.feature_writable`.
	#
	# This used to query `feature_readable` (copy-paste slip); it now asks
	# the model about writability, like `property_writable` does.
	redef fun feature_writable(name) do
		return model.feature_writable(name)
	end

	# Value of the feature `name`, as stored in `model`.
	redef fun feature(name) do return model.feature(name)

	# Set the feature `name` in `model`.
	redef fun feature=(name, value) do model.feature(name) = value

	# Delegate to `model.property_recognized`.
	redef fun property_recognized(name) do
		return model.property_recognized(name)
	end

	# Delegate to `model.property_readable`.
	redef fun property_readable(name) do
		return model.property_readable(name)
	end

	# Delegate to `model.property_writable`.
	redef fun property_writable(name) do
		return model.property_writable(name)
	end

	# Value of the property `name`, as stored in `model`.
	redef fun property(name) do
		return model.property(name)
	end

	# Set the property `name` in `model`.
	redef fun property=(name, value) do
		model.property(name) = value
	end

	# Parse the document described by `input`.
	#
	# A fresh locator is installed in `model` first. When `input` has a system
	# identifier, it is resolved: a failure is reported as a warning, a success
	# is recorded in the locator. The byte stream of `input` is used when there
	# is one; otherwise the file designated by the resolved system identifier
	# is opened, parsed, then closed. A fatal error is fired when the file
	# cannot be found, or when `input` has neither a stream nor a system
	# identifier.
	redef fun parse(input) do
		var system_id: nullable MaybeError[String, Error] = null
		model.locator = new SAXLocatorImpl

		if input.system_id != null then
			system_id = resolve_system_id(input.system_id.as(not null))
			if system_id.is_error then
				# Only a warning at this point: the stream may still be usable.
				model.fire_warning(system_id.error.message, system_id.error)
			else
				model.locator.system_id = system_id.value
			end
		end
		model.locator.public_id = input.public_id
		# TODO: encoding
		if input.stream != null then
			# The stream is supplied by the caller; it is not closed here.
			lexer = new XophonLexer(model, input.stream.as(not null))
			parse_main
		else if system_id != null then
			if system_id.is_error then
				model.fire_fatal_error("File <{input.system_id.as(not null)}> not found.", null)
			else
				# The file is opened here, so the lexer is closed here too.
				lexer = new XophonLexer(model, new FileReader.open(system_id.value))
				parse_main
				lexer.close
			end
		else
			model.fire_fatal_error("At least a stream or a system identifier must be specified. None given.", null)
		end
	end

	# Parse the entity designated by `system_id`.
	#
	# Shortcut for `parse` with an `InputSource` built from `system_id`.
	redef fun parse_file(system_id) do
		parse(new InputSource.with_system_id(system_id))
	end


	############################################################################
	# Parsing

	# Note: Every `expect_*` function (except `parse_main`) does not call
	# `read_char` for the first byte and leaves the byte just after its
	# production in `last_char` (except in case of fatal error). They return
	# `false` on fatal error and at the end of the file.

	# Parse the main entity.
	#
	# Fires the document locator and the start-of-document event, starts the
	# lexer, parses the document, then fires the end-of-document event. The
	# result of `expect_document` is not used here.
	private fun parse_main do
		model.fire_document_locator
		model.fire_start_document
		lexer.start
		expect_document
		model.fire_end_document
	end

	# Expect a `document` production.
	#
	# Return `false` as soon as a sub-production fails; otherwise return the
	# result of `expect_miscs` for what follows the root element.
	private fun expect_document: Bool do
		var got_doctype = false
		var got_element = false

		# If the document starts with `<`, it may start with a XML declaration,
		# a processing instruction, a comment, a `DOCTYPE` declaration, the
		# root element or a white space.
		if lexer.accept('<') then
			if lexer.accept('?') then
				if not expect_pi_or_xml_decl then return false
			else if lexer.accept('!') then
				if lexer.accept('-') then
					if not lexer.expect('-', " at the beginning of a comment") or
							not expect_comment then
						return false
					end
				else
					if not expect_doctype_decl then return false
					got_doctype = true
				end
			else
				if not expect_root then return false
				# The `DOCTYPE` declaration *must* come before the root
				# element.
				got_doctype = true
				got_element = true
			end
		else if not lexer.accept_s then
			return lexer.fire_unexpected_char(
					". Expecting a white space or `<`")
		end

		# After the XML declaration (if there is one), the document may contain
		# processing instructions, comments, the `DOCTYPE` declaration and
		# the root element.
		# These productions may be separated by white space.
		while not got_element do
			if lexer.accept('<') then
				if lexer.accept('?') then
					if not expect_pi then return false
				else if lexer.accept('!') then
					if lexer.accept('-') then
						if not lexer.expect('-', " at the beginning of a comment") or
								not expect_comment then
							return false
						end
					else if got_doctype then
						# A second `DOCTYPE` declaration is not allowed, so
						# only a comment may follow `<!` here.
						return lexer.fire_unexpected_char(". Expecting `-`")
					else if expect_doctype_decl then
						got_doctype = true
					else
						return false
					end
				else
					if not expect_root then return false
					# The `DOCTYPE` declaration *must* come before the root
					# element.
					got_doctype = true
					got_element = true
				end
			else if not lexer.accept_s then
				return lexer.fire_unexpected_char(
						". Expecting a white space or `<`")
			end
		end
		return expect_miscs
	end

	# Expect a `DOCTYPE` declaration.
	#
	# Not implemented: always fires a fatal error and returns its result.
	private fun expect_doctype_decl: Bool do
		return model.fire_fatal_error("DTD not supported yet.\n", null) # TODO
	end

	# Expect the root `element` production, without the first `<` token.
	#
	# Content chunks are read until a failure, the end of the file or the
	# closing of the root element. Pending character data is flushed before
	# returning, whatever the outcome.
	private fun expect_root: Bool do
		var success = true
		var char_data = new FlatBuffer

		success = expect_stag
		while success and not lexer.eof and not model.root_closed do
			success = expect_content_chunk(char_data)
		end
		if success then
			success = model.expect_root_closed
		end
		flush(char_data)
		return success
	end

	# Parse a `EmptyElemTag | STag | ETag | Reference | CDSect | PI | Comment | CharData` production.
	#
	# If the last read byte matches the `CharData` production, push the char in
	# `char_data`. Else, flush `CharData` as a `characters` event.
	private fun expect_content_chunk(char_data: Buffer): Bool do
		if lexer.accept('<') then
			flush(char_data)
			if lexer.accept('!') then
				if lexer.accept('-') then
					return lexer.expect('-', " at the beginning of a comment") and
							expect_comment
				else if lexer.accept('[') then
					return expect_cd_sect
				else
					return lexer.fire_unexpected_char(
							". Expecting `--` or `[CDATA[`")
				end
			else if lexer.accept('?') then
				return expect_pi
			else if lexer.accept('/') then
				return expect_etag
			else
				return expect_stag
			end
		else if lexer.accept('&') then
			# Flush before and after, so the text produced by the reference
			# is not merged with the surrounding character data.
			flush(char_data)
			var success = expect_reference(char_data)
			flush(char_data)
			return success
		else
			return lexer.expect_xml_char(char_data)
		end
	end

	# Expect a `EmptyElemTag | STag` production, without the initial `<`.
	#
	# Fires the start of the attributes, one event per attribute, then the
	# start of the element. For an empty-element tag (`/>`), the end of the
	# element is fired right after its start.
	private fun expect_stag: Bool do
		var name_buffer = new FlatBuffer

		if lexer.expect_name(name_buffer) then
			var name = name_buffer.to_s

			model.fire_start_attributes
			loop
				if lexer.accept('>') then
					model.fire_start_element(name)
					return true
				else if lexer.accept('/') then
					if lexer.expect('>', "") then
						model.fire_start_element(name)
						model.fire_end_element(name)
						return true
					else
						return false
					end
				else if lexer.expect_s then
					# After white space: end of the tag or another attribute.
					if lexer.accept('/') then
						if lexer.expect('>', "") then
							model.fire_start_element(name)
							model.fire_end_element(name)
							return true
						else
							return false
						end
					else if lexer.accept('>') then
						model.fire_start_element(name)
						return true
					else if not expect_attribute then
						return false
					end
				else
					return lexer.fire_unexpected_char(" in tag. " +
							"Expecting an attribute, `/`, `>` or white space")
				end
			end
		end
		return false
	end

	# NOTE(review): the text below is damaged in this copy of the file. The end
	# of the doc comment and the head of the `expect_etag` method (which is
	# called by `expect_content_chunk`) are missing. The remnant is kept
	# verbatim; restore the method from version control instead of guessing
	# the lost part.
	#
	# Expect a `ETag` production, without the initial `', "") then return model.fire_end_element(name_buf.to_s) else return false end end

	# Expect an `Attributes` production.
	#
	# Parses a name, the `=` sign and the value, then fires the attribute.
	private fun expect_attribute: Bool do
		var name = new FlatBuffer
		var value = new FlatBuffer

		if lexer.expect_name(name) and
				lexer.expect_eq and
				expect_att_value(value) then
			model.fire_attribute(name.to_s, value.to_s)
			return true
		else
			return false
		end
	end

	# Expect the `Misc*` production at the end of a document.
private fun expect_miscs: Bool do
	while not lexer.eof do
		if not lexer.accept('<') then
			# Outside of markup, only white space may remain.
			if lexer.accept_s then continue
			return lexer.fire_unexpected_char(
					". Expecting a white space or `<`")
		end

		# A `<` was consumed: only a processing instruction or a comment
		# is allowed here.
		if lexer.accept('?') then
			if not expect_pi then return false
		else if lexer.accept('!') then
			if not (lexer.expect_string("--", " at the beginning of a comment") and
					expect_comment) then
				return false
			end
		else
			return lexer.fire_unexpected_char(". Expecting `?` or `!`")
		end
	end
	return true
end

# Expect a `AttValue` production.
#
# The value is read up to the delimiter that opened it, and its content
# (with references handled by `expect_reference`) is appended to `buffer`.
private fun expect_att_value(buffer: Buffer): Bool do
	var quote = lexer.expect_delimiter
	if quote < 0 then return false

	loop
		# The opening delimiter seen again ends the value.
		if lexer.accept_int(quote) then return true

		if lexer.accept('&') then
			# TODO: [WFC: No < in Attribute Values]
			if not expect_reference(buffer) then return false
		else
			if not lexer.expect_att_value_char(buffer) then return false
		end
	end
end

# Expect a `SystemLiteral` production.
#
# Also used to parse productions that do not have references.
# The content found between the delimiters is appended to `buffer`.
private fun expect_literal(buffer: Buffer): Bool do
	var quote = lexer.expect_delimiter
	if quote < 0 then return false

	loop
		# The opening delimiter seen again ends the literal.
		if lexer.accept_int(quote) then return true
		if not lexer.expect_xml_char(buffer) then return false
	end
end

# Expect a `Comment` production, without the beginning.
#
# Assume `last_char` is the fifth byte of the production that is, the
# next byte after the `'