--- /dev/null
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# This file is free software, which comes along with NIT. This software is
+# distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE. You can modify it is you want, provided this header
+# is kept unaltered, and a notification of the changes is added.
+# You are allowed to redistribute it and sell it, alone or is a part of
+# another product.
+
+# XML DOM-parsing facilities
+module parser
+
+intrude import parser_base
+intrude import xml_entities
+
+# Provides XML parsing facilities
+class XMLProcessor
+ super StringProcessor
+
+ # Parses a full XML document
+ #
+ # Returns the `XMLDocument` holding the parsed tree, or an `XMLError` as
+ # soon as a tag cannot be read or a start/end tag has no counterpart.
+ fun parse_document: XMLEntity do
+     # Start tags whose end tag has not been met yet, innermost last
+     var stack = new Array[XMLStartTag]
+     var doc = new XMLDocument
+     loop
+         ignore_whitespaces
+         if pos >= src.length then break
+         # New nodes hang under the innermost open tag, or under the document
+         var owner: XMLEntity = doc
+         if not stack.is_empty then owner = stack.last
+         if src[pos] == '<' then
+             var tag = read_tag
+             if tag isa XMLError then return tag
+             if tag isa XMLEndTag then
+                 if stack.is_empty then
+                     return new XMLError(location = tag.location, "Missing matching tag for `{tag.tag_name}`")
+                 end
+                 var st_last = stack.pop
+                 if tag.tag_name != st_last.tag_name then
+                     return new XMLError("Missing matching tag for `{st_last.tag_name}`", location=st_last.location)
+                 end
+                 # Link both ends of the element to each other
+                 st_last.matching = tag
+                 tag.matching = st_last
+             else
+                 tag.parent = owner
+                 # Only start tags open a new nesting level
+                 if tag isa XMLStartTag then stack.push tag
+             end
+         else
+             var st = pos
+             var next_tag = ignore_until("<")
+             if next_tag == -1 then
+                 # No tag follows: the character data runs up to the end of
+                 # `src` (-1 is the "not found" value the other callers of
+                 # `ignore_until` in this class test for)
+                 next_tag = src.length
+                 pos = next_tag
+             end
+             var pc = new PCDATA(src.substring(st, next_tag - st).trim)
+             pc.parent = owner
+         end
+     end
+     # Anything left on the stack was opened but never closed
+     if not stack.is_empty then
+         var miss = stack.pop
+         return new XMLError("Missing matching tag for `{miss.tag_name}`", location=miss.location)
+     end
+     return doc
+ end
+
+ # Reads the tag starting in `src` at current position
+ #
+ # The character following `<` selects the reader: `!` for special tags,
+ # `?` for prolog and processing instructions, `/` for end tags, and
+ # anything else for start tags.
+ #
+ # Returns an `XMLError` if the current character is not `<` or if `src`
+ # ends right after it.
+ private fun read_tag: XMLEntity do
+     var st_loc = new Location(line, line_offset)
+     var c = src[pos]
+     if c != '<' then return new XMLError(location=st_loc, "Expected start of tag, got `{c}`")
+     pos += 1
+     # A lone `<` at the very end of the source has no tag kind to dispatch on
+     if pos >= src.length then return new XMLError(location=st_loc, "Unexpected EOF after start of tag")
+     c = src[pos]
+     if c == '!' then
+         # Special tag
+         return read_special_tag(st_loc)
+     else if c == '?' then
+         # Prolog tag
+         return read_prolog_tag(st_loc)
+     else if c == '/' then
+         # End tag
+         return read_end_tag(st_loc)
+     else
+         # Start tag
+         return read_start_tag(st_loc)
+     end
+ end
+
+ # Reads a Special tag (starting with <!)
+ #
+ # Handles comments (`<!-- -->`), CDATA sections (`<![CDATA[ ]]>`),
+ # DOCTYPE declarations (delegated to `parse_doctype`) and, as a fallback,
+ # any other `<!...>` construct, which is kept verbatim.
+ #
+ # In case of error, returns a `XMLError`
+ private fun read_special_tag(st_loc: Location): XMLEntity do
+     var srclen = src.length
+     pos += 1
+     if (pos + 2) >= srclen then return new XMLError(location=st_loc, "Unexpected EOF on start of Special tag")
+     if src[pos] == '-' and src[pos + 1] == '-' then
+         pos += 2
+         var comst = pos
+         var endcom = ignore_until("-->")
+         if endcom == -1 then return new XMLError(location=st_loc, "Malformatted comment")
+         pos += 3
+         # `endcom` is where the closing `-->` starts, so the comment text
+         # stops right before it
+         return new XMLCommentTag(location=st_loc ,src.substring(comst, endcom - comst))
+     end
+     var st = pos
+     if srclen - pos >= 7 then
+         var spe_type = src.substring(pos, 7)
+         if spe_type == "[CDATA[" then
+             pos += 7
+             var cdst = pos
+             var cdend = ignore_until("]]>")
+             # Test the search result itself: a section closed by the very
+             # last characters of the source is complete
+             if cdend == -1 then return new XMLError(location = st_loc, "Unfinished CDATA block")
+             pos += 3
+             return new CDATA(src.substring(cdst, cdend - cdst))
+         else if spe_type == "DOCTYPE" then
+             pos += 7
+             return parse_doctype(st_loc)
+         end
+     end
+     var end_spec = ignore_until(">")
+     if end_spec == -1 then return new XMLError(location=st_loc, "Unfinished Special tag")
+     pos += 1
+     return new XMLSpecialTag(location=st_loc, src.substring(st, end_spec - st))
+ end
+
+ # Parse a Doctype declaration tag
+ #
+ # Reads whitespace-separated elements up to the closing `>`. A `[...]`
+ # internal subset is kept as one element, brackets included.
+ #
+ # Returns an `XMLDoctypeTag` whose content is the elements joined with a
+ # single space, or an `XMLError` if the declaration is not terminated.
+ private fun parse_doctype(st_loc: Location): XMLEntity do
+     var elemts = new Array[String]
+     var srclen = src.length
+     loop
+         ignore_whitespaces
+         if pos >= srclen then return new XMLError(location = st_loc, "Malformatted doctype")
+         var c = src[pos]
+         # TODO: Properly support intern DOCTYPE definitions
+         if c == '[' then
+             var intern_st = pos
+             var intern_end = ignore_until("]")
+             if intern_end == -1 then return new XMLError(location = st_loc, "Unfinished internal doctype declaration")
+             pos += 1
+             elemts.push src.substring(intern_st, intern_end - intern_st + 1)
+             continue
+         end
+         var elm_st = pos
+         while pos < srclen and not src[pos].is_whitespace and src[pos] != '>' do pos += 1
+         if pos >= srclen then return new XMLError(location = st_loc, "Malformatted doctype")
+         # Keep every non-empty element, one-character names included
+         if pos > elm_st then elemts.push src.substring(elm_st, pos - elm_st)
+         if src[pos] == '>' then
+             pos += 1
+             return new XMLDoctypeTag(location = st_loc, "DOCTYPE", elemts.join(" "))
+         end
+     end
+ end
+
+ # Reads a Prolog or Processing Instruction tag (starting with <?)
+ #
+ # A target named exactly `xml` is read as the prolog, with attributes.
+ # Any other target is read as a processing instruction whose content is
+ # everything up to the closing `?>`. Only targets spelling `xml` in some
+ # letter case are refused as processing instructions; names that merely
+ # contain it, such as `xml-stylesheet`, are accepted.
+ #
+ # In case of error, returns a `XMLError`
+ private fun read_prolog_tag(st_loc: Location): XMLEntity do
+     var srclen = src.length
+     pos += 1
+     if pos >= srclen then return new XMLError(location=st_loc, "Invalid start of prolog")
+     var tag_name = parse_tag_name(['<', '>'])
+     if pos < srclen then
+         var c = src[pos]
+         if c == '<' or c == '>' then return new XMLError(location=st_loc ,"Unexpected character `{c}` in prolog declaration")
+     end
+     if tag_name == "xml" then
+         var args = parse_args(['?'])
+         for i in args do
+             if i isa BadXMLAttribute then return new XMLError(location = i.location, i.name)
+         end
+         # Without a bad attribute, `parse_args` stopped on `?`; make sure a
+         # character follows before looking for `>`
+         if pos + 1 < srclen and src[pos] == '?' and src[pos + 1] == '>' then
+             pos += 2
+             return new XMLPrologTag(location=st_loc, tag_name, args)
+         end
+     else
+         if tag_name.to_lower == "xml" then return new XMLError(location = st_loc, "Forbidden keyword xml in Processing Instruction")
+         var cont_st = pos
+         var cont_end = ignore_until("?>")
+         if cont_end == -1 then
+             pos += 2
+             return new XMLError(location = st_loc, "Malformatted Processing Instruction tag")
+         end
+         pos += 2
+         return new XMLProcessingInstructionTag(location=st_loc, tag_name, src.substring(cont_st, cont_end - cont_st))
+     end
+     pos += 1
+     return new XMLError(location=st_loc, "Malformatted prolog tag")
+ end
+
+ # Reads an End tag (starting with </)
+ #
+ # Whitespace is accepted between the name and the closing `>`.
+ #
+ # In case of error, returns a `XMLError`
+ private fun read_end_tag(st_loc: Location): XMLEntity do
+     var srclen = src.length
+     pos += 1
+     # `</` may be the last characters of the source: no name to read then
+     var tag_name = ""
+     if pos < srclen then tag_name = parse_tag_name(['<', '>'])
+     ignore_whitespaces
+     if pos < srclen and src[pos] == '>' then
+         pos += 1
+         return new XMLEndTag(location=st_loc, tag_name)
+     end
+     return new XMLError(location = st_loc, "Bad end tag `{tag_name}`")
+ end
+
+ # Reads a Start tag (starting with <)
+ #
+ # Returns an `XMLOnelinerTag` when the tag is closed with `/>`, an
+ # `XMLStartTag` when it is closed with `>`.
+ #
+ # In case of error, returns a `XMLError`
+ private fun read_start_tag(st_loc: Location): XMLEntity do
+     var tag_name = parse_tag_name(['/', '>'])
+     var args = parse_args(['/', '>'])
+     for i in args do
+         if i isa BadXMLAttribute then return new XMLError(location=i.location, i.name)
+     end
+     # Without a bad attribute, `parse_args` stopped on `/` or on `>`
+     if src[pos] == '/' then
+         if pos + 1 < src.length and src[pos + 1] == '>' then
+             pos += 2
+             return new XMLOnelinerTag(location=st_loc, tag_name, args)
+         end
+         # A `/` that does not close the tag cannot be skipped silently
+         return new XMLError(location=st_loc, "Malformatted tag `{tag_name}`")
+     end
+     pos += 1
+     return new XMLStartTag(location=st_loc, tag_name, args)
+ end
+
+ # Parses an xml tag name
+ #
+ # Reads from the current position up to the first whitespace, the first
+ # character of `delims`, or the end of `src`, whichever comes first.
+ # `pos` is left on the character that stopped the read.
+ private fun parse_tag_name(delims: Array[Char]): String do
+     var idst = pos
+     var srclen = src.length
+     # Check the bound before every read so that a name running up to the
+     # end of the source does not index past it
+     while pos < srclen do
+         var c = src[pos]
+         if c.is_whitespace or delims.has(c) then break
+         pos += 1
+     end
+     return src.substring(idst, pos - idst).trim
+ end
+
+ # Parse the arguments of a tag
+ #
+ # Attributes are collected until `parse_arg` signals the end of the
+ # attribute block. A `BadXMLAttribute` is kept as the last element of the
+ # result and stops the read.
+ private fun parse_args(endtags: Array[Char]): Array[XMLAttribute] do
+     var attrs = new Array[XMLAttribute]
+     var arg = parse_arg(endtags)
+     while not arg isa XMLAttributeEnd do
+         attrs.add arg
+         if arg isa BadXMLAttribute then break
+         arg = parse_arg(endtags)
+     end
+     return attrs
+ end
+
+ # Parses the next argument in `src`
+ #
+ # Returns an `XMLAttributeEnd` when the next non-blank character is one
+ # of `endtags`, an `XMLStringAttr` for a `name=value` pair whose value is
+ # quoted with `'` or `"`, and a `BadXMLAttribute` carrying the error
+ # message otherwise.
+ private fun parse_arg(endtags: Array[Char]): XMLAttribute do
+     var limit = src.length
+     ignore_whitespaces
+     var st_loc = new Location(line, line_offset)
+     if pos >= limit then return new BadXMLAttribute(location = st_loc, "Unfinished attribute name")
+     # FIXME: Ugly, but as long as it remains private, it is OK I guess
+     if endtags.has(src[pos]) then return new XMLAttributeEnd("")
+     # The name runs up to the `=` sign
+     var name_from = pos
+     loop
+         if pos >= limit then return new BadXMLAttribute(location = st_loc, "Unfinished attribute name")
+         var cur = src[pos]
+         if cur == '=' then break
+         if endtags.has(cur) then return new BadXMLAttribute(location = st_loc, "Malformatted attribute")
+         pos += 1
+     end
+     var name = src.substring(name_from, pos - name_from).trim
+     pos += 1
+     ignore_whitespaces
+     if pos >= limit then return new BadXMLAttribute(location=st_loc, "Unfinished attribute `{name}`")
+     # The value must open with a quote and runs up to the same quote
+     var quote = src[pos]
+     if not (quote == '\'' or quote == '"') then return new BadXMLAttribute(location=st_loc, "Invalid string delimiter `{quote}` for attribute `{name}`")
+     var val_from = pos + 1
+     pos = val_from
+     while pos < limit and src[pos] != quote do pos += 1
+     if pos >= limit then return new BadXMLAttribute(location=st_loc, "Unfinished attribute `{name}`")
+     var val = src.substring(val_from, pos - val_from)
+     pos += 1
+     return new XMLStringAttr(location=st_loc, name, val, quote)
+ end
+end
+
+redef class Text
+ # Tries to parse the current string to XML
+ #
+ # The text is converted with `to_s` and handed to a new `XMLProcessor`.
+ #
+ # Returns an `XMLDocument` if successful, or an `XMLError` if not
+ fun to_xml: XMLEntity do
+     var processor = new XMLProcessor(to_s)
+     return processor.parse_document
+ end
+end
--- /dev/null
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# This file is free software, which comes along with NIT. This software is
+# distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE. You can modify it is you want, provided this header
+# is kept unaltered, and a notification of the changes is added.
+# You are allowed to redistribute it and sell it, alone or is a part of
+# another product.
+
+# Basic blocks for DOM-XML representation
+#
+# DOM entities are defined in this module, specifically:
+#
+# * `XMLEntity`: Abstract kind of XML-related node
+# * `XMLDocument`: A well-formed XML document, root of the tree
+# * `PCDATA`: Raw XML-escaped character data
+# * `CDATA`: Raw data, may contain invalid XML escape characters
+# * `XMLTag`: Abstract XML tag element
+# * `XMLAttrTag`: Abstract XML element, they may contain attributes
+# * `XMLOnelinerTag`: Any tag contained on one-line only
+# * `XMLStartTag`: A tag starting a new hierarchy level in the tree
+# * `XMLPrologTag`: A tag containing meta-information on the document, must start with <?xml
+# * `XMLProcessingInstructionTag`: Any XML tag starting with <? other than the prolog tag
+# * `XMLEndTag`: A tag signaling the end of a block
+# * `XMLCommentTag`: A comment tag
+# * `XMLSpecialTag`: A special tag, which may contain meta-information
+# * `XMLDoctypeTag`: A DOCTYPE tag, use to register a DTD
+# * `XMLAttribute`: Any kind of attribute that may be attached to a tag
+# * `XMLStringAttr`: An attribute containing a String
+# * `XMLError`: Any kind of error thrown while parsing a document
+module xml_entities
+
+import parser_base
+
+# Any kind of XML Entity
+#
+# Entities form a tree: each one may have a `parent` and holds an ordered
+# collection of `children`.
+abstract class XMLEntity
+ # Optional parent of `self`
+ #
+ # The raw writer is `set_parent`, private to the module; use `parent=`
+ # to move an entity in the tree.
+ var parent: nullable XMLEntity is private writable(set_parent)
+
+ # Optional location of the entity in source
+ var location: nullable Location
+
+ # The children of `self`
+ var children: Sequence[XMLEntity] = new XMLEntities(self)
+
+ # Sets the parent of `self` to `e`
+ #
+ # `self` is removed from the children of its former parent, if any, and
+ # added to the children of `e`.
+ fun parent=(e: XMLEntity) do
+     var old = parent
+     if old != null then old.children.remove(self)
+     e.children.add(self)
+     # Record the new parent explicitly: adding to `children` alone does
+     # not guarantee that the attribute is written
+     set_parent e
+ end
+end
+
+# Proxy collection of XMLEntities, ordered, used for the children of an entity
+#
+# Every mutation keeps the `parent` of the affected entities in sync with
+# the content of the collection: an entity that enters gets `owner` as its
+# parent and leaves the children of its former parent, an entity that
+# leaves gets no parent.
+private class XMLEntities
+ super Sequence[XMLEntity]
+
+ # The owner, aka, the parent
+ var owner: XMLEntity
+
+ # Backing storage of the children, in document order
+ private var entities = new List[XMLEntity]
+
+ # Removes `e` from the children of its current parent, if it has one
+ private fun release(e: XMLEntity) do
+     var old = e.parent
+     if old != null then old.children.remove(e)
+ end
+
+ redef fun length do return entities.length
+
+ redef fun [](i) do return entities[i]
+
+ # Replaces the child at `index` with `el`
+ #
+ # The replaced child loses its parent, `el` is taken away from its
+ # former parent.
+ redef fun []=(index, el) do
+     var olde = entities[index]
+     if olde == el then return
+     release(el)
+     # Releasing `el` may have shifted `olde` when both were children of
+     # `owner`, so look its position up again
+     entities[entities.index_of(olde)] = el
+     olde.set_parent null
+     el.set_parent owner
+ end
+
+ # Appends `e`, unless it already is one of the children
+ redef fun push(e) do
+     if entities.has(e) then return
+     release(e)
+     entities.add e
+     # Write the attribute directly: going through `parent=` would come
+     # back here and find `e` already stored
+     e.set_parent owner
+ end
+
+
+ # Removes `e` if it is one of the children, does nothing otherwise
+ redef fun remove(e) do
+     if e isa XMLEntity then
+         # Leave entities owned by someone else untouched
+         if entities.has(e) then
+             e.set_parent null
+             entities.remove(e)
+         end
+     end
+ end
+
+ redef fun has(e) do return entities.has(e)
+
+ redef fun iterator do return entities.iterator
+
+ redef fun reverse_iterator do return entities.reverse_iterator
+
+ redef fun pop do
+     var e = entities.pop
+     e.set_parent null
+     return e
+ end
+
+ # Puts `e` in front of the children, taking it from its former parent
+ redef fun unshift(e) do
+     release(e)
+     entities.unshift e
+     e.set_parent owner
+ end
+
+ redef fun shift do
+     var e = entities.shift
+     e.set_parent null
+     return e
+ end
+
+ # Inserts `it` at `index`, taking it from its former parent
+ redef fun insert(it, index) do
+     var at = index
+     # If `it` already sits before `index` in this collection, releasing
+     # it moves the target position one step to the left
+     var prev = entities.index_of(it)
+     release(it)
+     if prev != -1 and prev < at then at -= 1
+     entities.insert(it, at)
+     it.set_parent owner
+ end
+
+ redef fun remove_at(ind) do
+     var el = entities[ind]
+     entities.remove_at(ind)
+     el.set_parent null
+ end
+end
+
+# Top XML Document-Object Model element
+class XMLDocument
+ super XMLEntity
+
+ # The concatenation of the textual form of every child, in order
+ redef fun to_s do
+     var out = new Array[String]
+     for child in children do out.add child.to_s
+     return out.join("")
+ end
+end
+
+# PCDATA is any kind of non-xml formatted text
+class PCDATA
+ super XMLEntity
+
+ # Any string containing non XML-reserved characters
+ var content: String
+
+ # The text held by the node, as is
+ redef fun to_s do
+     return content
+ end
+end
+
+# CDATA are regions in which no xml entity is parsed, all is ignored
+class CDATA
+ super XMLEntity
+
+ # Any string contained within a CDATA block, may contain XML-reserved characters
+ var content: String
+
+ # The content wrapped back in its `<![CDATA[` and `]]>` markers
+ redef fun to_s do
+     return "<![CDATA[" + content + "]]>"
+ end
+end
+
+# A Tag is a node in a DOM tree
+#
+# Common ancestor of every entity written between `<` and `>`.
+abstract class XMLTag
+ super XMLEntity
+
+ # The name of the tag
+ #
+ # Note that the parser also uses this field to carry the text of
+ # comments and of unrecognized `<!...>` tags, which have no name of
+ # their own.
+ var tag_name: String
+end
+
+# Any kind of XML tag with attributes
+abstract class XMLAttrTag
+ super XMLTag
+
+ # List of attributes in a Tag
+ #
+ # Subclasses print them separated by a single space in `to_s`.
+ var attributes: Array[XMLAttribute]
+end
+
+# One-liner XML Tag (Ends with />)
+class XMLOnelinerTag
+ super XMLAttrTag
+
+ # The tag name, then the attributes if there are any, closed by `/>`
+ redef fun to_s do
+     if attributes.is_empty then return "<{tag_name}/>"
+     var attrs = attributes.join(" ")
+     return "<{tag_name} {attrs}/>"
+ end
+end
+
+# A (potentially) multi-line spanning XML Tag start
+class XMLStartTag
+ super XMLAttrTag
+
+ # Optional matching tag, must be matched for the document to be well-formed
+ var matching: nullable XMLEndTag
+
+ # The opening tag, followed by every child and by the matching end tag
+ # when there is one
+ redef fun to_s do
+     var out = new Array[String]
+     out.add "<{tag_name}"
+     if not attributes.is_empty then
+         out.add " "
+         out.add attributes.join(" ")
+     end
+     out.add ">"
+     for child in children do out.add child.to_s
+     var closing = matching
+     if closing != null then out.add closing.to_s
+     return out.join("")
+ end
+end
+
+# Any prolog style-Tag (starting with <?xml)
+class XMLPrologTag
+ super XMLAttrTag
+
+ # The tag name and the space-separated attributes between `<?` and `?>`
+ redef fun to_s do
+     var attrs = attributes.join(" ")
+     return "<?{tag_name} {attrs}?>"
+ end
+end
+
+# Processing instructions start with <? and are to be read by a third-party application
+class XMLProcessingInstructionTag
+ super XMLTag
+
+ # Raw text of the instruction, printed after the tag name
+ var content: String
+
+ # The tag name and the content, separated by a space, between `<?` and `?>`
+ redef fun to_s do
+     return "<?" + tag_name + " " + content + "?>"
+ end
+end
+
+# An end Tag (starting with </)
+class XMLEndTag
+ super XMLTag
+
+ # Optional matching tag, must be matched for the document to be well-formed
+ var matching: nullable XMLStartTag
+
+ # The tag name between `</` and `>`
+ redef fun to_s do
+     return "</" + tag_name + ">"
+ end
+end
+
+# An XML comment tag
+class XMLCommentTag
+ super XMLTag
+
+ # The value of `tag_name` between `<!--` and `-->`
+ redef fun to_s do
+     return "<!--" + tag_name + "-->"
+ end
+end
+
+# A DOCTYPE Tag
+class XMLDoctypeTag
+ super XMLTag
+
+ # Text of the declaration, printed after the `DOCTYPE` keyword
+ var content: String
+
+ # The content between `<!DOCTYPE ` and `>`
+ redef fun to_s do
+     return "<!DOCTYPE " + content + ">"
+ end
+end
+
+# A Special Tag (starts with !)
+#
+# TODO: Support the remaining ! tags
+class XMLSpecialTag
+ super XMLTag
+
+ # The value of `tag_name` between `<!` and `>`
+ redef fun to_s do
+     return "<!" + tag_name + ">"
+ end
+end
+
+# Attributes are contained in tags, they provide meta-information on a tag
+abstract class XMLAttribute
+ super XMLEntity
+
+ # Name of the attribute
+ #
+ # For a `BadXMLAttribute`, the parser stores the error message here
+ # instead of a real attribute name.
+ var name: String
+end
+
+# An attribute with a String value
+class XMLStringAttr
+ super XMLAttribute
+
+ # Value of the attribute without the double quotes
+ var value: String
+
+ # Type of delimiter (can be either " or ')
+ var delimiter: Char
+
+ # The name, an `=` sign, then the value enclosed in its delimiter
+ redef fun to_s do
+     var quote = delimiter.to_s
+     return name + "=" + quote + value + quote
+ end
+end
+
+# Badly formed XML attribute
+class BadXMLAttribute
+ super XMLAttribute
+
+ # The value of `name`, as is
+ redef fun to_s do
+     return name
+ end
+end
+
+# Internal use only, shows the end of an attribute block
+#
+# The parser returns an instance, built with an empty name, when it meets
+# one of the characters closing the tag instead of an attribute.
+private class XMLAttributeEnd
+ super XMLAttribute
+end
+
+# Any XML Error that happens when parsing
+class XMLError
+ super XMLEntity
+
+ # Error message reported by the parser
+ var message: String
+
+ # The message, followed by the location of the error when it is known
+ redef fun to_s do
+     # `location` is nullable: do not call anything on it before checking
+     var loc = location
+     if loc == null then return "XML Error: {message}"
+     return "XML Error: {message} at {loc.to_s}"
+ end
+end