SAXophoNit’s lexer

Introduced classes

class XophonLexer

saxophonit :: XophonLexer

SAXophoNit’s lexer

All class definitions

class XophonLexer

saxophonit $ XophonLexer

SAXophoNit’s lexer
package_diagram saxophonit::lexer lexer saxophonit::reader_model reader_model saxophonit::lexer->saxophonit::reader_model sax sax saxophonit::reader_model->sax sax::sax_locator_impl sax_locator_impl saxophonit::reader_model->sax::sax_locator_impl sax::attributes_impl attributes_impl saxophonit::reader_model->sax::attributes_impl sax::namespace_support namespace_support saxophonit::reader_model->sax::namespace_support ...sax ... ...sax->sax ...sax::sax_locator_impl ... ...sax::sax_locator_impl->sax::sax_locator_impl ...sax::attributes_impl ... ...sax::attributes_impl->sax::attributes_impl ...sax::namespace_support ... ...sax::namespace_support->sax::namespace_support saxophonit::saxophonit saxophonit saxophonit::saxophonit->saxophonit::lexer

Ancestors

module abstract_collection

core :: abstract_collection

Abstract collection classes and services.
module abstract_text

core :: abstract_text

Abstract class for manipulation of sequences of characters
module array

core :: array

This module introduces the standard array structure.
module attributes

sax :: attributes

Interface for a list of XML attributes.
module attributes_impl

sax :: attributes_impl

Default implementation of the Attributes interface.
module bitset

core :: bitset

Services to handle BitSet
module bytes

core :: bytes

Services for byte streams and arrays
module circular_array

core :: circular_array

Efficient data structure to access both end of the sequence.
module codec_base

core :: codec_base

Base for codecs to use with streams
module codecs

core :: codecs

Group module for all codec-related manipulations
module collection

core :: collection

This module define several collection classes.
module content_handler

sax :: content_handler

Receives notification of the logical content of a document.
module core

core :: core

Standard classes and methods used by default by Nit programs and libraries.
module dtd_handler

sax :: dtd_handler

Receives notification of basic DTD-related events.
module entity_resolver

sax :: entity_resolver

Basic interface for resolving entities.
module environ

core :: environ

Access to the environment variables of the process
module error

core :: error

Standard error-management infrastructure.
module error_handler

sax :: error_handler

Basic interface for SAX error handlers.
module exec

core :: exec

Invocation and management of operating system sub-processes.
module file

core :: file

File manipulations (create, read, write, etc.)
module fixed_ints

core :: fixed_ints

Basic integers of fixed-precision
module fixed_ints_text

core :: fixed_ints_text

Text services to complement fixed_ints
module flat

core :: flat

All the array-based text representations
module gc

core :: gc

Access to the Nit internal garbage collection mechanism
module hash_collection

core :: hash_collection

Introduce HashMap and HashSet.
module input_source

sax :: input_source

A single input source for an XML entity.
module iso8859_1

core :: iso8859_1

Codec for ISO8859-1 I/O
module kernel

core :: kernel

Most basic classes and methods.
module list

core :: list

This module handle double linked lists
module math

core :: math

Mathematical operations
module namespace_support

sax :: namespace_support

Encapsulates Namespace logic for use by applications using SAX, or internally by SAX drivers.
module native

core :: native

Native structures for text and bytes
module numeric

core :: numeric

Advanced services for Numeric types
module protocol

core :: protocol

module queue

core :: queue

Queuing data structures and wrappers
module range

core :: range

Module for range of discrete objects.
module re

core :: re

Regular expression support for all services based on Pattern
module ropes

core :: ropes

Tree-based representation of a String.
module sax

sax :: sax

Core SAX APIs.
module sax_locator

sax :: sax_locator

Interface for associating a SAX event with a document location.
module sax_locator_impl

sax :: sax_locator_impl

Provides an optional convenience implementation of Locator.
module sax_parse_exception

sax :: sax_parse_exception

Encapsulates an XML parse error or warning.
module sorter

core :: sorter

This module contains classes used to compare things and sorts arrays.
module stream

core :: stream

Input and output streams of characters
module text

core :: text

All the classes and methods related to the manipulation of text entities
module time

core :: time

Management of time and dates
module union_find

core :: union_find

union–find algorithm using an efficient disjoint-set data structure
module utf8

core :: utf8

Codec for UTF-8 I/O
module xml_filter

sax :: xml_filter

Interface for an XML filter.
module xml_reader

sax :: xml_reader

Interface for reading an XML document using callbacks.

Parents

module reader_model

saxophonit :: reader_model

Reader’s model.

Children

module saxophonit

saxophonit :: saxophonit

A SAX 2 parser in Nit.

Descendants

module a_star-m

a_star-m

# SAXophoNit’s lexer
module saxophonit::lexer

import reader_model

# SAXophoNit’s lexer
#
# Except when noted otherwise, `accept` and `expect` functions return `true` on
# success and `false` on mismatch and at the end of the file.
# They both foward the cursor to the next byte on success, but only `expect`
# functions fire a fatal error on mismatch.
class XophonLexer

	# The model.
	var reader_model: XophonReaderModel

	# The input to read from.
	var input: Reader is writable

	# Alias to `reader_model.locator`.
	private var locator: SAXLocatorImpl is noinit

	init do
		locator = reader_model.locator.as(not null)
	end

	# Last read byte.
	#
	# Equals `-1` on end of file or error.
	private var last_char = -1

	# Before end-of-line handling, was the last read byte a CARRIAGE RETURN?
	private var was_cr: Bool = false


	# Expect a value delimiter (`"` or `'`).
	#
	# If the last read byte is a delimiter, return the delimiter and
	# read the next byte. Else, return `-1`.
	fun expect_delimiter: Int do
		if accept('"') then
			return '"'.code_point
		else if accept('\'') then
			return '\''.code_point
		else
			fire_unexpected_char(". Expecting `\"` or `'`")
			return -1
		end
	end

	# Is the last read byte matches the `Char` production?
	fun is_xml_char:Bool do
		# TODO: Handle code points above 0x7F.
		return last_char >= 32 or
				last_char == 9 or
				last_char == 10
	end

	# Push the last read byte in the specified buffer and read the next byte.
	#
	# If the last read byte is forbidden, fire a fatal error instead.
	fun expect_xml_char(buffer: Buffer): Bool do
		if is_xml_char then
			buffer.chars.push(last_char.code_point)
			read_char
			return true
		else if eof then
			return fire_fatal_error("Unexpected end of file.")
		else
			return fire_fatal_error("Forbidden character.")
		end
	end


	# Like `expect_xml_char`, but normalize white space and forbid `<`.
	#
	# SEE: The “3.3.3 Attribute-Value Normalization” section of any XML
	# recommendation.
	fun expect_att_value_char(buffer: Buffer): Bool do
		if is_s then
			buffer.chars.push(' ')
			read_char
			return true
		else if last_char == '<'.code_point then
			return fire_fatal_error("`<` is forbidden in attribute values.")
		else
			return expect_xml_char(buffer)
		end
	end

	# Is the last read byte matches the `S` production?
	fun is_s:Bool do
		return last_char == 32 or last_char == 9 or last_char == 10
	end

	# Skip a `S?` token and return `true`.
	fun skip_s: Bool do
		while is_s do read_char
		return true
	end

	# Accept a `S` token.
	fun accept_s: Bool do
		if is_s then
			read_char
			return skip_s
		else
			return false
		end
	end

	# Expect `S`.
	fun expect_s: Bool do
		return (accept_s and skip_s) or fire_unexpected_char(". Expecting white space")
	end

	# Is the last read byte matches the `NameStartChar` production?
	fun is_name_start_char: Bool do
		# TODO: Handle code points above 0x7F.
		return ['A'.code_point .. 'Z'.code_point].has(last_char) or
				['a'.code_point .. 'z'.code_point].has(last_char) or
				last_char == '_'.code_point or
				last_char == ':'.code_point or
				last_char > 127
	end

	# Is the last read byte matches the `NameChar` production?
	fun is_name_char: Bool do
		# TODO: Handle code points above 0x7F.
		return is_name_start_char or
				last_char == '-'.code_point or
				last_char == '.'.code_point or
				is_digit
	end

	# Expect a `Name` tokn.
	#
	# Append the parsed name to `buffer`.
	fun expect_name(buffer: Buffer): Bool do
		if not is_name_start_char then
			return fire_unexpected_char(" at the beginning of a name")
		end
		buffer.chars.push(last_char.code_point)
		read_char
		while is_name_char do
			buffer.chars.push(last_char.code_point)
			read_char
		end
		return true
	end

	# Expect a `PITarget` token.
	#
	# Append the parsed name to `buffer`.
	fun expect_pi_target(buffer: Buffer): Bool do
		return expect_name(buffer) and check_pi_target(buffer)
	end

	# Ensure the target is not `xml` (case-insensitive).
	#
	# Also, fire an error if the target contains a colon.
	fun check_pi_target(target: Text): Bool do
		var is_invalid = target.length == 3 and
				(target.chars[0] == 'X' or target.chars[0] == 'x') and
				(target.chars[0] == 'M' or target.chars[0] == 'm') and
				(target.chars[0] == 'L' or target.chars[0] == 'l')

		if is_invalid then
			return fire_fatal_error("Forbidden processing target `{target}`.")
		else
			if target.has(":") then
				reader_model.fire_error("The processing target `{target}` contains a colon.", null)
			end
			return true
		end
	end

	# Is the last read byte matches the `[0-9]` production?
	fun is_digit: Bool do
		return ['0'.code_point .. '9'.code_point].has(last_char)
	end

	# Accept a `[0-9]+` token.
	fun accept_digits(buffer: Buffer): Bool do
		if is_digit then
			loop
				buffer.chars.push(last_char.code_point)
				read_char
				if not is_digit then return true
			end
		else
			return false
		end
	end

	# Expect a `[0-9]+` token.
	fun expect_digits(buffer: Buffer): Bool do
		return accept_digits(buffer) or fire_unexpected_char(". Expecting a decimal digit")
	end

	# Is `last_char` matches the `[0-9a-fA-F]` production?
	fun is_hex: Bool do
		return ['0'.code_point .. '9'.code_point].has(last_char) or
				['A'.code_point .. 'Z'.code_point].has(last_char) or
				['a'.code_point .. 'Z'.code_point].has(last_char)
	end

	# Expect a `[0-9a-fA-F]+` token.
	fun expect_hex(buffer: Buffer): Bool do
		if is_hex then
			loop
				buffer.chars.push(last_char.code_point)
				read_char
				if not is_hex then return true
			end
		else
			return fire_unexpected_char(". Expecting an hexadecimal digit")
		end
	end

	# Expect `Eq`.
	fun expect_eq: Bool do
		return skip_s and expect('=', "") and skip_s
	end


	############################################################################
	# General

	# Read a byte and put it in `last_char`.
	#
	# In case of an end-of-file or an error, put -1 in `last_char`.
	private fun read_char do
		if locator.line_number < 0 then
			locator.line_number = 1
			locator.column_number = 1
		else if last_char < 0 then
			fire_fatal_error("Internal error: Already at the end of the file.")
			return
		else if last_char == '\n'.code_point then
			locator.line_number += 1
			locator.column_number = 1
		else
			locator.column_number += 1
		end

		var s = input.read_byte
		if s < 0 then
			last_char = -1
			return
		end
		last_char = s.to_i

		# XML 1.0 end-of-line handling
		# Note: Regardless the XML version, any EOL defined by the
		# recommandation MUST be reported as a single LINE FEED.
		if was_cr and last_char == '\n'.code_point then
			# EOL already reported. => Skip this byte.
			s = input.read_byte
			if s < 0 then
				last_char = -1
			else
				last_char = s
			end
		end
		was_cr = last_char == '\r'.code_point
		if was_cr then
			# Regardless the following byte, '\r' always introduce an EOL.
			last_char = '\n'.code_point
		end
	end

	# Is it the end of the stream?
	#
	# Also return `true` after a fatal error.
	fun eof: Bool do return last_char < 0

	# Start the lexer.
	fun start do
		if eof then
			last_char = 0
			read_char
		end
	end

	# Close the input.
	fun close do
		last_char = -1
		input.close
	end

	# Does the last read byte equal `c`?
	fun is_int(c: Int): Bool do return last_char == c

	# Does the last read byte equal `c`?
	fun is_char(c: Char): Bool do return last_char == c.code_point

	# Expect the specified byte.
	fun accept_int(expected: Int): Bool do
		if last_char == expected then
			read_char
			return true
		else
			return false
		end
	end

	# Accept the specified byte.
	fun accept(expected: Char): Bool do
		return accept_int(expected.code_point)
	end

	# Ensure the last read byte is equal to `expected`.
	#
	# If it is, read the next byte. If not, fire a fatal error using
	# `context`. `context` is the part of the message that gives the context.
	# For example, in `Unexpected ``x`` in y. Expecting ``z``.`, the value of
	# `context` is `" in y"`.
	#
	# Return `true` if and only if the last read byte as the expected value.
	fun expect_int(expected: Int, context: String): Bool do
		return accept_int(expected) or
				fire_unexpected_char("{context}. Expecting `{expected.code_point}`.")
	end

	# Ensure the last read byte is equal to `expected`.
	#
	# If it is, read the next byte. If not, fire a fatal error using
	# `context`. `context` is the part of the message that gives the context.
	# For example, in `Unexpected ``x`` in y. Expecting ``z``.`, the value of
	# `context` is `" in y"`.
	#
	# Return `true` if and only if the last read byte as the expected value.
	fun expect(expected: Char, context: String): Bool do
		return accept(expected) or
				fire_unexpected_char("{context}. Expecting `{expected}`.")
	end

	# Ensure the last read byte and following bytes match `expected`.
	#
	# If it is, read one more byte. If not, fire a fatal error using
	# `context`. `context` is the part of the message that gives the context.
	# For example, in `Unexpected ``x`` in y. Expecting ``z``.`, the value of
	# `context` is `" in y"`.
	#
	# Return `true` if and only if the last read byte and following bytes
	# match `expected`.
	fun expect_string(expected: String, context: String): Bool do
		var chars = expected.chars
		var i = 0

		while i < chars.length do
			if not accept(chars[i]) then
				if is_xml_char then
					return fire_fatal_error("Unexpected " +
							"`{expected.substring(0, i)}{last_char.code_point.to_s}`" +
							"{context}. Expecting `{expected}`.")
				else if eof then
					return fire_fatal_error("Unexpected end of file{context}. " +
							"Expecting `{expected}`.")
				else
					return fire_fatal_error("Forbidden character.")
				end
			end
			i += 1
		end
		return true
	end


	############################################################################
	# Dispatching

	# Fire a fatal error about an unexpected character.
	#
	# Return `false`.
	fun fire_unexpected_char(rest_of_message: String): Bool do
		if is_xml_char then
			return fire_fatal_error("Unexpected character `{last_char.code_point.to_s}`{rest_of_message}.")
		else if eof then
			return fire_fatal_error("Unexpected end of file{rest_of_message}.")
		else
			return fire_fatal_error("Forbidden character.")
		end
	end

	# Fire a fatal error with the specified message.
	#
	# Return `false`.
	private fun fire_fatal_error(message: String): Bool do
		reader_model.fire_fatal_error(message, null)
		last_char = -1
		return false
	end
end
lib/saxophonit/lexer.nit:11,1--405,3