A SAX 2 parser in Nit.

# A SAX 2 parser in Nit.
module saxophonit

import sax
intrude import core::file
private import reader_model
private import lexer

# Implementation of the `XMLReader` interface.
#
# For the moment, only XML 1.0 is (partially) supported.
#
# The following mandatory features of XML 1.0 are not yet supported:
#
# * Parsing of entities (files) encoded in UTF-16.
# * Encoding handling.
# * Entity references resolving (except for built-in references).
# * Handling of the options specified in the XML declaration.
# * Parsing of a `DOCTYPE` declaration.
#
# Also note that this XML processor is unable to retrieve a file from a URL
# (only local paths are supported).
#
# Usage example:
#
#     # Retrieve all text nodes.
#     class TextListener
#     	super ContentHandler
#     #
#     	private var buf: Buffer = new FlatBuffer
#     	private var sp: Bool = false
#     #
#     	redef fun characters(str: String) do
#     		if sp then
#     			if buf.length > 0 then buf.append(" ")
#     			sp = false
#     		end
#     		buf.append(str)
#     	end
#     #
#     	redef fun ignorable_whitespace(str: String) do
#     		sp = true
#     	end
#     #
#     	# Return the concatenation of all text nodes.
#     	redef fun to_s do return buf.to_s
#     end
#     #
#     var text = new TextListener
#     var reader = new XophonReader
#     #
#     reader.content_handler = text
#     reader.parse(new InputSource.with_stream(new StringReader("<foo>bar baz <n>42</n>.</foo>")))
#     assert text.to_s == "bar baz 42."
class XophonReader
	super XMLReader

	# State shared with the lexer: handlers, features, properties and locator.
	private var model = new XophonReaderModel
	# Lexer of the entity being parsed. Assigned by `parse` before `parse_main` is called.
	private var lexer: XophonLexer is noinit

	# Return the entity resolver held by the model.
	redef fun entity_resolver do
		return model.entity_resolver
	end

	# Store `entity_resolver` in the model.
	redef fun entity_resolver=(entity_resolver) do model.entity_resolver = entity_resolver

	# Return the DTD handler held by the model.
	redef fun dtd_handler do
		return model.dtd_handler
	end

	# Store `dtd_handler` in the model.
	redef fun dtd_handler=(dtd_handler) do model.dtd_handler = dtd_handler

	# Return the content handler held by the model.
	redef fun content_handler do
		return model.content_handler
	end

	# Store `content_handler` in the model.
	redef fun content_handler=(content_handler) do model.content_handler = content_handler

	# Return the error handler held by the model.
	redef fun error_handler do
		return model.error_handler
	end

	# Store `error_handler` in the model.
	redef fun error_handler=(error_handler) do model.error_handler = error_handler


	# Ask the model whether the feature `name` is recognized.
	redef fun feature_recognized(name) do return model.feature_recognized(name)

	# Ask the model whether the feature `name` is readable.
	redef fun feature_readable(name) do return model.feature_readable(name)

	# Ask the model whether the feature `name` is writable.
	#
	# Delegates to the model's `feature_writable` query (not `feature_readable`),
	# in the same way `property_writable` delegates to `model.property_writable`.
	redef fun feature_writable(name) do
		return model.feature_writable(name)
	end

	# Return the value of the feature `name`, as stored in the model.
	redef fun feature(name) do
		return model.feature(name)
	end

	# Set the feature `name` to `value` in the model.
	redef fun feature=(name, value) do
		model.feature(name) = value
	end

	# Ask the model whether the property `name` is recognized.
	redef fun property_recognized(name) do return model.property_recognized(name)

	# Ask the model whether the property `name` is readable.
	redef fun property_readable(name) do return model.property_readable(name)

	# Ask the model whether the property `name` is writable.
	redef fun property_writable(name) do return model.property_writable(name)

	# Return the value of the property `name`, as stored in the model.
	redef fun property(name) do return model.property(name)

	# Set the property `name` to `value` in the model.
	redef fun property=(name, value) do model.property(name) = value

	# Parse the document described by `input`.
	#
	# A fresh locator is installed in the model. When `input` has a system
	# identifier, it is resolved: a resolution failure is reported as a warning,
	# a success is recorded in the locator.
	#
	# The stream of `input` is used when present. Otherwise, the file designated
	# by the resolved system identifier is opened, parsed, then closed. A fatal
	# error is fired if that file could not be resolved, or if `input` has
	# neither a stream nor a system identifier.
	redef fun parse(input) do
		var resolved: nullable MaybeError[String, Error] = null
		var raw_id = input.system_id

		model.locator = new SAXLocatorImpl
		if raw_id != null then
			resolved = resolve_system_id(raw_id)
			if not resolved.is_error then
				model.locator.system_id = resolved.value
			else
				model.fire_warning(resolved.error.message, resolved.error)
			end
		end
		model.locator.public_id = input.public_id
		# TODO: encoding

		var stream = input.stream
		if stream != null then
			# An explicit stream takes precedence over the system identifier.
			lexer = new XophonLexer(model, stream)
			parse_main
			return
		end
		if resolved == null then
			model.fire_fatal_error("At least a stream or a system identifier must be specified. None given.",
					null)
			return
		end
		if resolved.is_error then
			model.fire_fatal_error("File <{input.system_id.as(not null)}> not found.", null)
			return
		end
		# The reader opened this file itself, so it also closes it.
		lexer = new XophonLexer(model, new FileReader.open(resolved.value))
		parse_main
		lexer.close
	end

	# Parse the document designated by `system_id`.
	#
	# Wrap `system_id` in an `InputSource` and hand it to `parse`.
	redef fun parse_file(system_id) do
		var source = new InputSource.with_system_id(system_id)
		parse(source)
	end


	############################################################################
	# Parsing

	# Note: The `expect_*` functions (unlike `parse_main`) do not call
	# `read_char` for the first byte of their production, and they leave the
	# byte just after their production in `last_char` (except in case of fatal
	# error). They return `false` on fatal error and at the end of the file.

	# Parse the main entity.
	#
	# Fire the document locator and the start of the document, start the lexer,
	# parse the `document` production, then fire the end of the document.
	#
	# The result of `expect_document` is not inspected here: the end of the
	# document is fired whether or not the parsing succeeded.
	private fun parse_main do
		model.fire_document_locator
		model.fire_start_document
		lexer.start
		expect_document
		model.fire_end_document
	end

	# Expect a `document` production.
	#
	# The first markup of the document is handled apart from the following ones
	# because it is the only place where an XML declaration is accepted. Then,
	# processing instructions, comments, at most one `DOCTYPE` declaration and
	# white space are consumed until the root element has been parsed. Finally,
	# the trailing `Misc*` production is parsed by `expect_miscs`.
	#
	# Return `false` as soon as one of the sub-productions fails.
	private fun expect_document: Bool do
		# `true` once a `DOCTYPE` declaration is no longer allowed.
		var got_doctype = false
		# `true` once the root element has been parsed.
		var got_element = false

		# If the document start with `<`, it may start with a XML declaration,
		# a processing instruction, a comment, a `DOCTYPE` declaration, the
		# root element or a white space.
		if lexer.accept('<') then
			if lexer.accept('?') then
				if not expect_pi_or_xml_decl then return false
			else if lexer.accept('!') then
				if lexer.accept('-') then
					if not lexer.expect('-',
								" at the beginning of a comment") or
								not expect_comment then
						return false
					end
				else
					if not expect_doctype_decl then return false
					got_doctype = true
				end
			else
				if not expect_root then return false
				# The `DOCTYPE` declaration *must* come before the root
				# element.
				got_doctype = true
				got_element = true
			end
		else if not lexer.accept_s then
			return lexer.fire_unexpected_char(
					". Expecting a white space or `<`")
		end

		# After the XML declaration (if there is one), the document may contain
		# processing instructions, comments, the `DOCTYPE` declaration and
		# the root element.
		# These productions may be separated by white space.
		while not got_element do
			if lexer.accept('<') then
				if lexer.accept('?') then
					if not expect_pi then return false
				else if lexer.accept('!') then
					if lexer.accept('-') then
						if not lexer.expect('-',
									" at the beginning of a comment") or
									not expect_comment then
							return false
						end
					else if got_doctype then
						# A second `DOCTYPE` declaration is refused: after
						# `<!`, only a comment is acceptable at this point.
						return lexer.fire_unexpected_char(". Expecting `-`")
					else if expect_doctype_decl then
						got_doctype = true
					else
						return false
					end
				else
					if not expect_root then return false
					# The `DOCTYPE` declaration *must* come before the root
					# element.
					got_doctype = true
					got_element = true
				end
			else if not lexer.accept_s then
				return lexer.fire_unexpected_char(
						". Expecting a white space or `<`")
			end
		end
		return expect_miscs
	end

	# Expect a `DOCTYPE` declaration.
	#
	# Not implemented yet: fire a fatal error and return its result.
	private fun expect_doctype_decl: Bool do
		# TODO
		var status = model.fire_fatal_error("DTD not supported yet.\n", null)
		return status
	end

	# Expect the root `element` production, without the first `<` token.
	#
	# Parse the start tag, then content chunks until a failure, the end of the
	# file or the closing of the root element. Pending character data is always
	# flushed before returning.
	private fun expect_root: Bool do
		var text = new FlatBuffer
		var ok = expect_stag

		loop
			if not ok or lexer.eof or model.root_closed then break
			ok = expect_content_chunk(text)
		end
		if ok then ok = model.expect_root_closed
		flush(text)
		return ok
	end

	# Parse a `EmptyElemTag | STag | ETag | Reference | CDSect | PI | Comment | CharData` production.
	#
	# A character matching `CharData` is appended to `char_data`. Any markup or
	# reference first flushes `char_data` as a `characters` event. The text
	# produced by a reference is flushed on its own right after the reference.
	private fun expect_content_chunk(char_data: Buffer): Bool do
		if not lexer.accept('<') then
			# Plain character data.
			if not lexer.accept('&') then return lexer.expect_xml_char(char_data)
			# Reference.
			flush(char_data)
			var ok = expect_reference(char_data)
			flush(char_data)
			return ok
		end

		# Markup.
		flush(char_data)
		if not lexer.accept('!') then
			if lexer.accept('?') then return expect_pi
			if lexer.accept('/') then return expect_etag
			return expect_stag
		end
		if lexer.accept('-') then
			if not lexer.expect('-', " at the beginning of a comment") then return false
			return expect_comment
		end
		if lexer.accept('[') then return expect_cd_sect
		return lexer.fire_unexpected_char(". Expecting `--` or `[CDATA[`")
	end

	# Expect a `EmptyElemTag | STag` production, without the initial `<`.
	#
	# Read the element name, then loop over the attributes until the tag is
	# closed. For `>`, the start of the element is fired. For `/>`, the start
	# then the end of the element are fired.
	#
	# Return `false` if the name, an attribute or the end of the tag is invalid.
	private fun expect_stag: Bool do
		var name_buffer = new FlatBuffer

		if lexer.expect_name(name_buffer) then
			var name = name_buffer.to_s

			model.fire_start_attributes
			loop
				if lexer.accept('>') then
					model.fire_start_element(name)
					return true
				else if lexer.accept('/') then
					# Empty-element tag.
					if lexer.expect('>', "") then
						model.fire_start_element(name)
						model.fire_end_element(name)
						return true
					else
						return false
					end
				else if lexer.expect_s then
					# After the white space: the end of the tag or an attribute.
					if lexer.accept('/') then
						if lexer.expect('>', "") then
							model.fire_start_element(name)
							model.fire_end_element(name)
							return true
						else
							return false
						end
					else if lexer.accept('>') then
						model.fire_start_element(name)
						return true
					else if not expect_attribute then
						return false
					end
				else
					return lexer.fire_unexpected_char(" in tag. " +
							"Expecting an attribute, `/`, `>` or white space")
				end
			end
		end
		return false
	end

	# Expect a `ETag` production, without the initial `</`.
	#
	# Read the name, skip the optional white space and require `>`. Then return
	# the result of firing the end of the element.
	private fun expect_etag: Bool do
		var tag = new FlatBuffer

		if not lexer.expect_name(tag) then return false
		if not lexer.skip_s then return false
		if not lexer.expect('>', "") then return false
		return model.fire_end_element(tag.to_s)
	end

	# Expect an `Attribute` production.
	#
	# Parse the name, the `Eq` production and the value. On success, the
	# attribute is reported to the model.
	private fun expect_attribute: Bool do
		var key = new FlatBuffer
		var content = new FlatBuffer
		var parsed = lexer.expect_name(key) and
				lexer.expect_eq and
				expect_att_value(content)

		if parsed then model.fire_attribute(key.to_s, content.to_s)
		return parsed
	end

	# Expect the `Misc*` production at the end of a document.
	#
	# Until the end of the file, accept only processing instructions, comments
	# and white space.
	private fun expect_miscs: Bool do
		loop
			if lexer.eof then return true
			if not lexer.accept('<') then
				if lexer.accept_s then continue
				return lexer.fire_unexpected_char(
						". Expecting a white space or `<`")
			end
			if lexer.accept('?') then
				if not expect_pi then return false
			else if lexer.accept('!') then
				if not lexer.expect_string("--", " at the beginning of a comment") then
					return false
				end
				if not expect_comment then return false
			else
				return lexer.fire_unexpected_char(". Expecting `?` or `!`")
			end
		end
	end

	# Expect a `AttValue` production.
	#
	# The value is appended to `buffer`, references being replaced by their
	# value. Parsing stops at the delimiter that opened the value.
	private fun expect_att_value(buffer: Buffer): Bool do
		var quote = lexer.expect_delimiter
		if quote < 0 then return false

		while not lexer.accept_int(quote) do
			if lexer.accept('&') then
				# TODO: [WFC: No < in Attribute Values]
				if not expect_reference(buffer) then return false
			else if not lexer.expect_att_value_char(buffer) then
				return false
			end
		end
		return true
	end

	# Expect a `SystemLiteral` production.
	#
	# Also used to parse other quoted productions that do not have references.
	# The characters found between the delimiters are appended to `buffer`.
	private fun expect_literal(buffer: Buffer): Bool do
		var quote = lexer.expect_delimiter
		if quote < 0 then return false

		while not lexer.accept_int(quote) do
			if not lexer.expect_xml_char(buffer) then return false
		end
		return true
	end


	# Expect a `Comment` production, without the beginning.
	#
	# Assume `last_char` is the fifth byte of the production that is, the
	# next byte after the `'<!--'` token.
	#
	# A double-hyphen must be followed by `>`, which ends the comment and fires
	# the corresponding event.
	private fun expect_comment: Bool do
		var text: Buffer = new FlatBuffer

		loop
			if lexer.accept('-') then
				if lexer.accept('-') then
					if lexer.expect('>',
							" after a double-hyphen (`--`) in a comment") then
						model.fire_comment(text.to_s)
						return true
					end
					return false
				end
				# A lone hyphen belongs to the content of the comment.
				text.chars.push('-')
			end
			if not lexer.expect_xml_char(text) then return false
		end
	end

	# Expect a `PI` production, without the beginning.
	#
	# Assume `last_char` is the third byte of the production that is, the
	# next byte after the `'<?'` token.
	private fun expect_pi: Bool do
		var target = new FlatBuffer

		if not lexer.expect_pi_target(target) then return false
		return expect_pi_data(target.to_s)
	end

	# Expect the data part and the `'?>'` token of a `PI` production.
	#
	# `target` is the already parsed target of the processing instruction.
	# Without data (`'?>'` right after the target), the event is fired with
	# `null` as data. Otherwise, a white space is required and everything up
	# to the first `'?>'` is reported as data.
	#
	# Return `false` on fatal error.
	private fun expect_pi_data(target: String): Bool do
		if lexer.accept('?') then
			if not lexer.expect('>', " at the end of a processing instruction") then
				return false
			end
			model.fire_processing_instruction(target, null)
			return true
		end
		if not lexer.accept_s then
			return lexer.fire_unexpected_char(" after a processing " +
					"instruction target. Expecting a white space or `?>`")
		end

		var data: Buffer = new FlatBuffer

		loop
			if lexer.accept('?') then
				if lexer.accept('>') then break
				# A `?` that is not followed by `>` is data. Do not consume the
				# next character here: it may be the `?` of the closing `?>`
				# (as in `??>`), so let the next iteration examine it.
				data.chars.push('?')
			else if not lexer.expect_xml_char(data) then
				return false
			end
		end
		model.fire_processing_instruction(target, data.to_s)
		return true
	end

	# Expect a `PI | XMLDecl` production, without the beginning.
	#
	# Assume `last_char` is the third byte of the production that is, the
	# next byte after the `'<?'` token.
	#
	# The target `xml` introduces an XML declaration. Any other target is
	# checked by the lexer, then parsed as a processing instruction.
	private fun expect_pi_or_xml_decl: Bool do
		var name_buf: Buffer = new FlatBuffer

		if not lexer.expect_name(name_buf) then return false

		var target = name_buf.to_s
		if target == "xml" then return expect_xml_decl
		if not lexer.check_pi_target(target) then return false
		return expect_pi_data(target)
	end

	# Expect a `XMLDecl` production, without the initial `<?xml` token.
	#
	# The version information is mandatory. It may be followed by an encoding
	# declaration, then by a standalone declaration, each one introduced by a
	# white space. The declaration ends with `?>`.
	private fun expect_xml_decl: Bool do
		if not expect_version_info then return false
		# Without white space, nothing but the end of the declaration may follow.
		if not lexer.accept_s then return lexer.expect_string("?>", "")

		if lexer.is_char('e') then
			if not expect_encoding_decl then return false
			# At this point, we can only accept `S` or `'?>'`.
			if not lexer.accept_s then return lexer.expect_string("?>", "")
		end
		if lexer.is_char('s') then
			if not expect_sd_decl then return false
		end
		if not lexer.skip_s then return false
		return lexer.expect_string("?>", "")
	end

	# Expect a `EncodingDecl` token, without the initial `S` token.
	#
	# Fire a fatal error if the declared name is not a valid encoding name.
	private fun expect_encoding_decl: Bool do
		var label = new FlatBuffer
		var parsed = lexer.expect_string("encoding", "") and
				lexer.expect_eq and
				expect_literal(label)

		if not parsed then return false
		if label.has("^[A-Za-z][A-Za-z0-9._-]*$".to_re) then
			# TODO: Do something with the value.
			return true
		end
		return model.fire_fatal_error("`{label.to_s}` is not a valid " +
				"encoding name.", null)
	end

	# Expect a `SDDecl` token, without the initial `S` token.
	#
	# Fire a fatal error if the declared value is neither `yes` nor `no`.
	private fun expect_sd_decl: Bool do
		var literal = new FlatBuffer
		var parsed = lexer.expect_string("standalone", "") and
				lexer.expect_eq and
				expect_literal(literal)

		if not parsed then return false

		var value = literal.to_s
		if value != "yes" and value != "no" then
			return model.fire_fatal_error("`{value}` is not a valid value for " +
					"the `standalone` declaration. Expecting `yes` or `no`.",
					null)
		end
		# TODO: Do something with the value.
		return true
	end

	# Expect a `CDSect` production, without the beginning.
	#
	# Assume `last_char` is the fourth byte of the production that is, the
	# next byte after the `'<!['` token.
	#
	# Fire the start of the CDATA section, report its content as a `characters`
	# event (if not empty), then fire the end of the section.
	#
	# Return `false` on fatal error.
	private fun expect_cd_sect: Bool do
		var buffer: Buffer = new FlatBuffer

		# Number of consecutive closing brackets read and not yet pushed.
		var closing = 0

		if not lexer.expect_string("CDATA[",
				" at the beginning of a CDATA section.") then
			return false
		end
		model.fire_start_cdata
		loop
			if lexer.accept(']') then
				closing += 1
			else if closing >= 2 and lexer.accept('>') then
				# End of the section. Only the last two brackets belong to the
				# `]]>` token: the previous ones (as in `]]]>`) are content.
				for i in [0..closing - 2[ do buffer.chars.push(']')
				break
			else
				# The pending brackets were not part of a `]]>` token.
				# The terminator test above must come before this reset.
				for i in [0..closing[ do buffer.chars.push(']')
				closing = 0
				if not lexer.expect_xml_char(buffer) then return false
			end
		end
		flush(buffer)
		model.fire_end_cdata
		return true
	end

	# Expect a `VersionInfo` production.
	#
	# Only the major version `1` is accepted. A minor version other than `0`
	# is reported as a warning.
	private fun expect_version_info: Bool do
		if not lexer.expect_s then return false
		if not lexer.expect_string("version",
				" in the first attribute name of the XML declaration") then
			return false
		end
		if not lexer.expect_eq then return false

		var minor_digits: Buffer = new FlatBuffer
		var quote = lexer.expect_delimiter

		if quote < 0 then return false
		if not lexer.expect_string("1.", " as XML major version") then return false
		if not lexer.expect_digits(minor_digits) then return false
		if not lexer.expect_int(quote, "") then return false

		if minor_digits.to_s != "0" then
			model.fire_warning("Only XML 1.0 is supported. " +
					"Got a XML 1.{minor_digits.to_s} document.", null)
		end
		return true
	end

	# Expect a `Reference`, without the initial `&`.
	#
	# The referenced character (for `&#...;` and `&#x...;`) or the value of
	# the entity is appended to `buffer`. An entity that `resolve_reference`
	# does not know is a fatal error.
	private fun expect_reference(buffer: Buffer): Bool do
		# TODO: [WFC: Entity Declared]
		# TODO: [VC: Entity Declared]
		# TODO: [WFC: Parsed Entity]
		# TODO: [WFC: No Recursion]
		# TODO: Unicode

		var ref = new FlatBuffer

		if lexer.accept('#') then
			# Character reference.
			var code: Int

			if lexer.accept('x') then
				if not lexer.expect_hex(ref) then
					return lexer.fire_unexpected_char(
							". Expecting an hexadecimal digit")
				end
				code = ref.to_hex
			else
				if not lexer.accept_digits(ref) then
					return lexer.fire_unexpected_char(" in a character reference. " +
							"Expecting `x` or a decimal digit")
				end
				code = ref.to_i
			end
			buffer.chars.add(code.code_point)
			return lexer.expect(';', "")
		end

		# Entity reference.
		if not lexer.expect_name(ref) then
			return lexer.fire_unexpected_char(
					" in a reference. Expecting `#` or a name")
		end

		var name = ref.to_s
		if name.has(":") then
			model.fire_error("The entity name `{name}` contains a colon.", null)
		end

		var value = resolve_reference(name)
		if value == null then
			model.fire_fatal_error("Unknown entity `{name}`.", null)
			return false
		end
		buffer.append(value)
		return lexer.expect(';', "")
	end

	# Resolve the entity reference or return `null`.
	#
	# Only the five built-in entities (`lt`, `gt`, `amp`, `quot` and `apos`)
	# are known.
	private fun resolve_reference(name: String): nullable String do
		if name == "lt" then return "<"
		if name == "gt" then return ">"
		if name == "amp" then return "&"
		if name == "quot" then return "\""
		if name == "apos" then return "'"
		# TODO: Support non-builtin entities
		return null
	end

	# Flush the specified buffer as a `characters` event.
	#
	# The buffer is cleared once its content has been fired.
	# Do nothing if `buffer` is empty.
	private fun flush(buffer: Buffer) do
		if buffer.length <= 0 then return
		model.fire_characters(buffer.to_s)
		buffer.clear
	end


	############################################################################
	# Paths

	# Resolve the specified system id.
	#
	# For the moment, the system id is handled as a local path.
	private fun resolve_system_id(system_id: String): MaybeError[String, Error] do
		# TODO: handle URIs
		var resolved = realpath(system_id)
		return resolved
	end

	# Resolve the specified POSIX path.
	#
	# Like `String.realpath`, but with error handling: when the native
	# resolution yields a null address, an error is returned instead of a path.
	private fun realpath(path: String): MaybeError[String, Error] do
		var resolved = path.to_cstring.file_realpath

		if not resolved.address_is_null then
			return new MaybeError[String, Error](resolved.to_s, null)
		end

		var failure = new Error("File <{path}> not found.")
		return new MaybeError[String, Error](null, failure)
	end
end

lib/saxophonit/saxophonit.nit:11,1--743,3

module saxophonit

Summary

A SAX 2 parser in Nit.

Introduced classes

XophonReader

A SAX 2 parser in Nit.

Introduced classes

class XophonReader

Summary

All class definitions

XophonReader

All class definitions

class XophonReader

Summary

Ancestors

abstract_collection

abstract_text

array

attributes

attributes_impl

bitset

bytes

circular_array

codec_base

codecs

collection

content_handler

core

dtd_handler

entity_resolver

environ

error

error_handler

exec

file

fixed_ints

fixed_ints_text

flat

gc

hash_collection

input_source

iso8859_1

kernel

list

math

namespace_support

native

numeric

protocol

queue

range

re

reader_model

ropes

sax

sax_locator

sax_locator_impl

sax_parse_exception

sorter

stream

string_search

text

time

union_find

utf8

xml_filter

xml_reader

Parents

lexer

Descendants

a_star-m

Ancestors

module abstract_collection

module abstract_text

module array

module attributes

module attributes_impl

module bitset

module bytes

module circular_array