lib/dom: Introducing simple DOM XML parser
authorLucas Bajolet <r4pass@hotmail.com>
Thu, 11 Jun 2015 21:14:06 +0000 (17:14 -0400)
committerLucas Bajolet <r4pass@hotmail.com>
Thu, 11 Jun 2015 21:14:06 +0000 (17:14 -0400)
Signed-off-by: Lucas Bajolet <r4pass@hotmail.com>

lib/dom/dom.nit [new file with mode: 0644]
lib/dom/parser.nit [new file with mode: 0644]
lib/dom/xml_entities.nit [new file with mode: 0644]

diff --git a/lib/dom/dom.nit b/lib/dom/dom.nit
new file mode 100644 (file)
index 0000000..9234057
--- /dev/null
@@ -0,0 +1,14 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# This file is free software, which comes along with NIT.  This software is
+# distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+# without  even  the implied warranty of  MERCHANTABILITY or  FITNESS FOR A
+# PARTICULAR PURPOSE.  You can modify it is you want,  provided this header
+# is kept unaltered, and a notification of the changes is added.
+# You  are  allowed  to  redistribute it and sell it, alone or is a part of
+# another product.
+
+# Easy XML DOM parser
+module dom
+
+import parser
diff --git a/lib/dom/parser.nit b/lib/dom/parser.nit
new file mode 100644 (file)
index 0000000..a4d8d41
--- /dev/null
@@ -0,0 +1,298 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# This file is free software, which comes along with NIT.  This software is
+# distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+# without  even  the implied warranty of  MERCHANTABILITY or  FITNESS FOR A
+# PARTICULAR PURPOSE.  You can modify it is you want,  provided this header
+# is kept unaltered, and a notification of the changes is added.
+# You  are  allowed  to  redistribute it and sell it, alone or is a part of
+# another product.
+
+# XML DOM-parsing facilities
+module parser
+
+intrude import parser_base
+intrude import xml_entities
+
+# Provides XML parsing facilities
+class XMLProcessor
+       super StringProcessor
+
+       # Parses a full XML document
+       #
+       # Returns the `XMLDocument` at the root of the tree when the whole of `src`
+       # could be parsed, or the first `XMLError` encountered otherwise.
+       fun parse_document: XMLEntity do
+               # Start tags that are still waiting for their end tag, innermost last
+               var stack = new Array[XMLStartTag]
+               var doc = new XMLDocument
+               loop
+                       ignore_whitespaces
+                       if pos >= src.length then break
+                       # New entities belong to the innermost open tag, or to the
+                       # document itself when no tag is open
+                       var owner: XMLEntity = doc
+                       if not stack.is_empty then owner = stack.last
+                       if src[pos] == '<' then
+                               var tag = read_tag
+                               if tag isa XMLStartTag then
+                                       tag.parent = owner
+                                       stack.push tag
+                               else if tag isa XMLEndTag then
+                                       if stack.is_empty then
+                                               return new XMLError(location = tag.location, "Missing matching tag for `{tag.tag_name}`")
+                                       end
+                                       var st_last = stack.pop
+                                       if tag.tag_name != st_last.tag_name then
+                                               return new XMLError("Missing matching tag for `{st_last.tag_name}`", location=st_last.location)
+                                       end
+                                       st_last.matching = tag
+                                       tag.matching = st_last
+                               else if tag isa XMLError then
+                                       return tag
+                               else
+                                       tag.parent = owner
+                               end
+                       else
+                               var st = pos
+                               var end_pc = ignore_until("<")
+                               # No further `<`: the character data runs to the end of `src`.
+                               # Without this, a negative length would reach `substring`.
+                               if end_pc == -1 then
+                                       end_pc = src.length
+                                       pos = end_pc
+                               end
+                               var pc = new PCDATA(src.substring(st, end_pc - st).trim)
+                               pc.parent = owner
+                       end
+               end
+               if not stack.is_empty then
+                       var miss = stack.pop
+                       return new XMLError("Missing matching tag for `{miss.tag_name}`", location=miss.location)
+               end
+               return doc
+       end
+
+       # Reads the tag starting in `src` at current position
+       #
+       # Dispatches on the character following `<`: `!` for special tags,
+       # `?` for prolog and processing instructions, `/` for end tags and
+       # anything else for start tags.
+       #
+       # In case of error, returns a `XMLError`
+       private fun read_tag: XMLEntity do
+               var st_loc = new Location(line, line_offset)
+               var c = src[pos]
+               if not c == '<' then return new XMLError(location=st_loc, "Expected start of tag, got `{c}`")
+               pos += 1
+               # `<` was the last character of the input, there is nothing to dispatch on
+               if pos >= src.length then return new XMLError(location=st_loc, "Unexpected EOF after start of tag")
+               c = src[pos]
+               if c == '!' then
+                       # Special tag
+                       return read_special_tag(st_loc)
+               else if c == '?' then
+                       # Prolog tag
+                       return read_prolog_tag(st_loc)
+               else if c == '/' then
+                       # End tag
+                       return read_end_tag(st_loc)
+               else
+                       # Start tag
+                       return read_start_tag(st_loc)
+               end
+       end
+
+       # Reads a Special tag (starting with <!)
+       #
+       # Handles comments (`<!-- -->`), CDATA blocks (`<![CDATA[ ]]>`) and
+       # DOCTYPE declarations; any other `<!...>` becomes a `XMLSpecialTag`.
+       #
+       # In case of error, returns a `XMLError`
+       private fun read_special_tag(st_loc: Location): XMLEntity do
+               var srclen = src.length
+               pos += 1
+               if (pos + 2) >= srclen then return new XMLError(location=st_loc, "Unexpected EOF on start of Special tag")
+               if src[pos] == '-' and src[pos + 1] == '-' then
+                       pos += 2
+                       var comst = pos
+                       var endcom = ignore_until("-->")
+                       if endcom == -1 then return new XMLError(location=st_loc, "Malformatted comment")
+                       pos += 3
+                       # `endcom` is the index of the first `-` of `-->`, which is not part of the comment
+                       return new XMLCommentTag(location=st_loc ,src.substring(comst, endcom - comst))
+               end
+               var st = pos
+               if srclen - pos >= 7 then
+                       var spe_type = src.substring(pos, 7)
+                       if spe_type == "[CDATA[" then
+                               pos += 7
+                               var cdst = pos
+                               var cdend = ignore_until("]]>")
+                               # Rely on the search result rather than on `pos`, so that a
+                               # block ending exactly at the end of `src` is still accepted
+                               if cdend == -1 then return new XMLError(location = st_loc, "Unfinished CDATA block")
+                               pos += 3
+                               return new CDATA(location = st_loc, src.substring(cdst, cdend - cdst))
+                       else if spe_type == "DOCTYPE" then
+                               pos += 7
+                               return parse_doctype(st_loc)
+                       end
+               end
+               var end_spec = ignore_until(">")
+               if end_spec == -1 then return new XMLError(location=st_loc, "Unfinished Special tag")
+               pos += 1
+               return new XMLSpecialTag(location=st_loc, src.substring(st, end_spec - st))
+       end
+
+       # Parse a Doctype declaration tag
+       #
+       # Collects the whitespace-separated elements of the declaration, plus any
+       # `[...]` internal subset taken verbatim, until the closing `>`.
+       # The elements are joined with single spaces into the content of the
+       # returned `XMLDoctypeTag`.
+       #
+       # In case of error, returns a `XMLError`
+       private fun parse_doctype(st_loc: Location): XMLEntity do
+               var elemts = new Array[String]
+               var srclen = src.length
+               loop
+                       ignore_whitespaces
+                       if pos >= srclen then return new XMLError(location = st_loc, "Malformatted doctype")
+                       var c = src[pos]
+                       # TODO: Properly support intern DOCTYPE definitions
+                       if c == '[' then
+                               var intern_st = pos
+                               var intern_end = ignore_until("]")
+                               if intern_end == -1 then return new XMLError(location = st_loc, "Unfinished internal doctype declaration")
+                               pos += 1
+                               # Keep both brackets in the stored element
+                               elemts.push src.substring(intern_st, intern_end - intern_st + 1)
+                               continue
+                       end
+                       var elm_st = pos
+                       while pos < srclen and not src[pos].is_whitespace and src[pos] != '>' do pos += 1
+                       if pos >= srclen then return new XMLError(location = st_loc, "Malformatted doctype")
+                       # Any non-empty token counts, including single-character ones
+                       if pos - elm_st > 0 then
+                               var str = src.substring(elm_st, pos - elm_st)
+                               elemts.push str
+                       end
+                       if src[pos] == '>' then
+                               pos += 1
+                               return new XMLDoctypeTag(location = st_loc, "DOCTYPE", elemts.join(" "))
+                       end
+               end
+       end
+
+       # Reads a Prolog or Processing Instruction tag (starting with <?)
+       #
+       # A target named exactly `xml` is parsed as the prolog, with attributes.
+       # Any other target is a processing instruction whose content is kept
+       # verbatim up to `?>`. Targets equal to `xml` in another letter case are
+       # rejected, while names that merely contain it (e.g. `xml-stylesheet`)
+       # are accepted.
+       #
+       # In case of error, returns a `XMLError`
+       private fun read_prolog_tag(st_loc: Location): XMLEntity do
+               var srclen = src.length
+               pos += 1
+               if pos >= srclen then return new XMLError(location=st_loc, "Invalid start of prolog")
+               var tag_name = parse_tag_name(['<', '>'])
+               # The name may run up to the end of the input
+               if pos >= srclen then return new XMLError(location=st_loc, "Malformatted prolog tag")
+               var c = src[pos]
+               if c == '<' or c == '>' then return new XMLError(location=st_loc ,"Unexpected character `{c}` in prolog declaration")
+               if tag_name == "xml" then
+                       var args = parse_args(['?'])
+                       for i in args do
+                               if i isa BadXMLAttribute then return new XMLError(location = i.location, i.name)
+                       end
+                       if pos + 1 < srclen and src[pos] == '?' and src[pos + 1] == '>' then
+                               pos += 2
+                               return new XMLPrologTag(location=st_loc, tag_name, args)
+                       end
+               else
+                       if tag_name.to_lower == "xml" then return new XMLError(location = st_loc, "Forbidden keyword xml in Processing Instruction")
+                       var cont_st = pos
+                       var cont_end = ignore_until("?>")
+                       if cont_end == -1 then
+                               pos += 2
+                               return new XMLError(location = st_loc, "Malformatted Processing Instruction tag")
+                       end
+                       pos += 2
+                       return new XMLProcessingInstructionTag(location=st_loc, tag_name, src.substring(cont_st, cont_end - cont_st))
+               end
+               pos += 1
+               return new XMLError(location=st_loc, "Malformatted prolog tag")
+       end
+
+       # Reads an End tag (starting with </)
+       #
+       # Whitespace is tolerated between the tag name and the closing `>`.
+       #
+       # In case of error, returns a `XMLError`
+       private fun read_end_tag(st_loc: Location): XMLEntity do
+               pos += 1
+               var tag_name = parse_tag_name(['<', '>'])
+               ignore_whitespaces
+               # The bound check covers an input that stops before the closing `>`
+               if pos < src.length and src[pos] == '>' then
+                       pos += 1
+                       return new XMLEndTag(location=st_loc, tag_name)
+               end
+               return new XMLError(location = st_loc, "Bad end tag `{tag_name}`")
+       end
+
+       # Reads a Start tag (starting with <)
+       #
+       # Returns a `XMLOnelinerTag` when the tag is closed with `/>`,
+       # a `XMLStartTag` when it is closed with `>`.
+       #
+       # In case of error, returns a `XMLError`
+       private fun read_start_tag(st_loc: Location): XMLEntity do
+               var srclen = src.length
+               var tag_name = parse_tag_name(['/', '>'])
+               var args = parse_args(['/', '>'])
+               for i in args do
+                       if i isa BadXMLAttribute then return new XMLError(location=i.location, i.name)
+               end
+               # `parse_args` only ends without a bad attribute when `src[pos]` is one
+               # of the delimiters given to it, so `pos` is in bounds here
+               if src[pos] == '/' then
+                       # A `/` is only legal here as the first half of `/>`
+                       if pos + 1 >= srclen or src[pos + 1] != '>' then
+                               return new XMLError(location=st_loc, "Malformatted tag `{tag_name}`")
+                       end
+                       pos += 2
+                       return new XMLOnelinerTag(location=st_loc, tag_name, args)
+               end
+               pos += 1
+               return new XMLStartTag(location=st_loc, tag_name, args)
+       end
+
+       # Parses an xml tag name
+       #
+       # Advances `pos` until a whitespace, one of `delims` or the end of `src`
+       # is reached, and returns the trimmed text that was skipped.
+       # `pos` is left on the character that stopped the scan.
+       private fun parse_tag_name(delims: Array[Char]): String do
+               var idst = pos
+               var srclen = src.length
+               # Check the bound before every read, so a name that runs up to the
+               # end of the input does not index past `src`
+               while pos < srclen do
+                       var c = src[pos]
+                       if c.is_whitespace or delims.has(c) then break
+                       pos += 1
+               end
+               return src.substring(idst, pos - idst).trim
+       end
+
+       # Parse the arguments of a tag
+       #
+       # Attributes are read one by one with `parse_arg` until the end of the
+       # attribute block is signalled. Parsing also stops after the first
+       # `BadXMLAttribute`, which is kept as the last element of the result.
+       private fun parse_args(endtags: Array[Char]): Array[XMLAttribute] do
+               var collected = new Array[XMLAttribute]
+               var current = parse_arg(endtags)
+               while not current isa XMLAttributeEnd do
+                       collected.add current
+                       if current isa BadXMLAttribute then break
+                       current = parse_arg(endtags)
+               end
+               return collected
+       end
+
+       # Parses the next argument in `src`
+       #
+       # Returns a `XMLAttributeEnd` when the next significant character is one
+       # of `endtags`, a `BadXMLAttribute` whose name is the error message when
+       # the attribute is malformed, and a `XMLStringAttr` otherwise.
+       private fun parse_arg(endtags: Array[Char]): XMLAttribute do
+               var srclen = src.length
+               ignore_whitespaces
+               var st_loc = new Location(line, line_offset)
+               if pos >= srclen then return new BadXMLAttribute(location = st_loc, "Unfinished attribute name")
+               # FIXME: Ugly, but as long as it remains private, it is OK I guess
+               if endtags.has(src[pos]) then return new XMLAttributeEnd("")
+               # The name is everything up to the `=` sign
+               var name_st = pos
+               loop
+                       if pos >= srclen then return new BadXMLAttribute(location = st_loc, "Unfinished attribute name")
+                       var c = src[pos]
+                       if c == '=' then break
+                       if endtags.has(c) then return new BadXMLAttribute(location = st_loc, "Malformatted attribute")
+                       pos += 1
+               end
+               var name = src.substring(name_st, pos - name_st).trim
+               # Skip `=` and any whitespace before the opening delimiter
+               pos += 1
+               ignore_whitespaces
+               if pos >= srclen then return new BadXMLAttribute(location=st_loc, "Unfinished attribute `{name}`")
+               var quote = src[pos]
+               if quote != '\'' and quote != '"' then return new BadXMLAttribute(location=st_loc, "Invalid string delimiter `{quote}` for attribute `{name}`")
+               # The value lies between the opening delimiter and its next occurrence
+               pos += 1
+               var val_st = pos
+               while pos < srclen and src[pos] != quote do pos += 1
+               if pos >= srclen then return new BadXMLAttribute(location=st_loc, "Unfinished attribute `{name}`")
+               var val = src.substring(val_st, pos - val_st)
+               pos += 1
+               return new XMLStringAttr(location=st_loc, name, val, quote)
+       end
+end
+
+redef class Text
+       # Tries to parse the current string to XML
+       #
+       # The text is converted with `to_s` and handed to a fresh `XMLProcessor`.
+       #
+       # Returns an `XMLDocument` if successful, or an `XMLError` if not
+       fun to_xml: XMLEntity do
+               var processor = new XMLProcessor(self.to_s)
+               return processor.parse_document
+       end
+end
diff --git a/lib/dom/xml_entities.nit b/lib/dom/xml_entities.nit
new file mode 100644 (file)
index 0000000..ef080d4
--- /dev/null
@@ -0,0 +1,298 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# This file is free software, which comes along with NIT.  This software is
+# distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+# without  even  the implied warranty of  MERCHANTABILITY or  FITNESS FOR A
+# PARTICULAR PURPOSE.  You can modify it is you want,  provided this header
+# is kept unaltered, and a notification of the changes is added.
+# You  are  allowed  to  redistribute it and sell it, alone or is a part of
+# another product.
+
+# Basic blocks for DOM-XML representation
+#
+# DOM entities are defined in this module, specifically:
+#
+# * `XMLEntity`: Abstract kind of XML-related node
+# * `XMLDocument`: A well-formed XML document, root of the tree
+# * `PCDATA`: Raw XML-escaped character data
+# * `CDATA`: Raw data, may contain invalid XML escape characters
+# * `XMLTag`: Abstract XML tag element
+# * `XMLAttrTag`: Abstract XML element, they may contain attributes
+# * `XMLOnelinerTag`: Any tag contained on one-line only
+# * `XMLStartTag`: A tag starting a new hierarchy level in the tree
+# * `XMLPrologTag`: A tag containing meta-information on the document, must start with <?xml
+# * `XMLProcessingInstructionTag`: Any XML tag starting with <? other than the prolog tag
+# * `XMLEndTag`: A tag signaling the end of a block
+# * `XMLCommentTag`: A comment tag
+# * `XMLSpecialTag`: A special tag, which may contain meta-information
+# * `XMLDoctypeTag`: A DOCTYPE tag, used to register a DTD
+# * `XMLAttribute`: Any kind of attribute that may be attached to a tag
+# * `XMLStringAttr`: An attribute containing a String
+# * `XMLError`: Any kind of error thrown while parsing a document
+module xml_entities
+
+import parser_base
+
+# Any kind of XML Entity
+#
+# Entities form a tree: each one has an optional `parent` and an ordered
+# collection of `children`.
+abstract class XMLEntity
+       # Optional parent of `self`
+       var parent: nullable XMLEntity is private writable(set_parent)
+
+       # Optional location of the entity in source
+       var location: nullable Location
+
+       # The children of `self`
+       var children: Sequence[XMLEntity] = new XMLEntities(self)
+
+       # Sets the parent of `self` to `e`
+       #
+       # `self` is removed from the children of its current parent, if any,
+       # then added to the children of `e`.
+       fun parent=(e: XMLEntity) do
+               var previous = parent
+               if previous != null then previous.children.remove(self)
+               e.children.add(self)
+       end
+end
+
+# Proxy collection of XMLEntities, ordered, used for the children of an entity
+#
+# Every mutation keeps the `parent` of the affected entities in sync:
+# an entity stored here has `owner` as parent, an entity taken out has none.
+# An entity inserted here is first removed from its previous parent, if any.
+private class XMLEntities
+       super Sequence[XMLEntity]
+
+       # The owner, aka, the parent
+       var owner: XMLEntity
+
+       # Backing storage of the children, in document order
+       private var entities = new List[XMLEntity]
+
+       # Removes `e` from the children of its current parent, if it has one
+       private fun detach(e: XMLEntity) do
+               var prev = e.parent
+               if prev != null then prev.children.remove(e)
+       end
+
+       redef fun length do return entities.length
+
+       redef fun [](i) do return entities[i]
+
+       # Replaces the entity at `index` with `el`
+       #
+       # The replaced entity loses its parent, `el` gets `owner` as parent.
+       redef fun []=(index, el) do
+               var olde = entities[index]
+               if olde == el then return
+               detach(el)
+               # `olde` may have shifted if `el` was stored before it in this sequence
+               var at = entities.index_of(olde)
+               olde.set_parent null
+               entities[at] = el
+               el.set_parent owner
+       end
+
+       # Appends `e`, unless it is already one of the children
+       redef fun push(e) do
+               if entities.has(e) then return
+               detach(e)
+               entities.add e
+               # Use the raw setter: `parent=` would call back into `push`
+               # and leave the attribute unset
+               e.set_parent owner
+       end
+
+       # Removes `e` if it is one of the children, does nothing otherwise
+       redef fun remove(e) do
+               if not e isa XMLEntity then return
+               # Do not touch the parent of an entity that is not stored here
+               if not entities.has(e) then return
+               entities.remove(e)
+               e.set_parent null
+       end
+
+       redef fun has(e) do return entities.has(e)
+
+       redef fun iterator do return entities.iterator
+
+       redef fun reverse_iterator do return entities.reverse_iterator
+
+       redef fun pop do
+               var e = entities.pop
+               e.set_parent null
+               return e
+       end
+
+       redef fun unshift(e) do
+               detach(e)
+               entities.unshift e
+               e.set_parent owner
+       end
+
+       redef fun shift do
+               var e = entities.shift
+               e.set_parent null
+               return e
+       end
+
+       redef fun insert(it, index) do
+               detach(it)
+               entities.insert(it, index)
+               it.set_parent owner
+       end
+
+       redef fun remove_at(ind) do
+               var el = entities[ind]
+               entities.remove_at(ind)
+               el.set_parent null
+       end
+end
+
+# Top XML Document-Object Model element
+class XMLDocument
+       super XMLEntity
+
+       # Concatenation of the string form of every child, in order
+       redef fun to_s do
+               var out = ""
+               for child in children do out += child.to_s
+               return out
+       end
+end
+
+# PCDATA is any kind of non-xml formatted text
+class PCDATA
+       super XMLEntity
+
+       # Any string containing non XML-reserved characters
+       var content: String
+
+       # The content, unchanged
+       redef fun to_s do
+               return content
+       end
+end
+
+# CDATA are regions in which no xml entity is parsed, all is ignored
+class CDATA
+       super XMLEntity
+
+       # Any string contained within a CDATA block, may contain XML-reserved characters
+       var content: String
+
+       # The content wrapped back in its `<![CDATA[` and `]]>` markers
+       redef fun to_s do
+               var body = content
+               return "<![CDATA[{body}]]>"
+       end
+end
+
+# A Tag is a node in a DOM tree
+#
+# Abstract base of all the tag entities, it only holds the name of the tag.
+abstract class XMLTag
+       super XMLEntity
+
+       # The name of the tag
+       var tag_name: String
+end
+
+# Any kind of XML tag with attributes
+#
+# Abstract base of the tags that carry an attribute list.
+abstract class XMLAttrTag
+       super XMLTag
+
+       # List of attributes in a Tag
+       var attributes: Array[XMLAttribute]
+end
+
+# One-liner XML Tag (Ends with />)
+class XMLOnelinerTag
+       super XMLAttrTag
+
+       # The tag name followed by the space-separated attributes, if any,
+       # between `<` and `/>`
+       redef fun to_s do
+               if attributes.is_empty then return "<{tag_name}/>"
+               var attrs = attributes.join(" ")
+               return "<{tag_name} {attrs}/>"
+       end
+end
+
+# A (potentially) multi-line spanning XML Tag start
+class XMLStartTag
+       super XMLAttrTag
+
+       # Optional matching tag, must be matched for the document to be well-formed
+       var matching: nullable XMLEndTag
+
+       # The opening tag with its attributes, then every child in order,
+       # then the matching end tag when there is one
+       redef fun to_s do
+               var out = "<{tag_name}"
+               if not attributes.is_empty then out += " " + attributes.join(" ")
+               out += ">"
+               for child in children do out += child.to_s
+               var closing = matching
+               if closing != null then out += closing.to_s
+               return out
+       end
+end
+
+# Any prolog style-Tag (starting with <?xml)
+class XMLPrologTag
+       super XMLAttrTag
+
+       # The tag name and the space-separated attributes between `<?` and `?>`
+       redef fun to_s do
+               var attrs = attributes.join(" ")
+               return "<?{tag_name} {attrs}?>"
+       end
+end
+
+# Processing instructions start with <? and are to be read by a third-party application
+class XMLProcessingInstructionTag
+       super XMLTag
+
+       # Raw text of the instruction, placed after the tag name by `to_s`
+       var content: String
+
+       # The tag name and the content between `<?` and `?>`
+       redef fun to_s do
+               var body = content
+               return "<?{tag_name} {body}?>"
+       end
+end
+
+# An end Tag (starting with </)
+class XMLEndTag
+       super XMLTag
+
+       # Optional matching tag, must be matched for the document to be well-formed
+       var matching: nullable XMLStartTag
+
+       # The tag name between `</` and `>`
+       redef fun to_s do
+               var name = tag_name
+               return "</{name}>"
+       end
+end
+
+# An XML comment tag
+#
+# `to_s` prints `tag_name` as the text of the comment.
+class XMLCommentTag
+       super XMLTag
+
+       # The comment text between `<!--` and `-->`
+       redef fun to_s do
+               var text = tag_name
+               return "<!--{text}-->"
+       end
+end
+
+# A DOCTYPE Tag
+class XMLDoctypeTag
+       super XMLTag
+
+       # Text of the declaration, placed after `<!DOCTYPE ` by `to_s`
+       var content: String
+
+       # The content between `<!DOCTYPE ` and `>`
+       redef fun to_s do
+               var body = content
+               return "<!DOCTYPE {body}>"
+       end
+end
+
+# A Special Tag (starts with !)
+#
+# TODO: Support the remaining ! tags
+class XMLSpecialTag
+       super XMLTag
+
+       # The tag name between `<!` and `>`
+       redef fun to_s do
+               var name = tag_name
+               return "<!{name}>"
+       end
+end
+
+# Attributes are contained in tags, they provide meta-information on a tag
+#
+# Abstract base of the attribute entities, it only holds the attribute name.
+abstract class XMLAttribute
+       super XMLEntity
+
+       # Name of the attribute
+       var name: String
+end
+
+# An attribute with a String value
+class XMLStringAttr
+       super XMLAttribute
+
+       # Value of the attribute without the double quotes
+       var value: String
+
+       # Type of delimiter (can be either " or ')
+       var delimiter: Char
+
+       # The name, an `=` sign, then the value enclosed in its delimiter
+       redef fun to_s do
+               var quote = delimiter
+               return "{name}={quote}{value}{quote}"
+       end
+end
+
+# Badly formed XML attribute
+#
+# The parser stores the error message in `name`.
+class BadXMLAttribute
+       super XMLAttribute
+
+       # The name, unchanged
+       redef fun to_s do
+               return name
+       end
+end
+
+# Internal use only, shows the end of an attribute block
+#
+# Returned by the parser's `parse_arg` when the next character is one of the
+# tag delimiters; `parse_args` stops on it and never adds it to its result.
+private class XMLAttributeEnd
+       super XMLAttribute
+end
+
+# Any XML Error that happens when parsing
+class XMLError
+       super XMLEntity
+
+       # Error message reported by the parser
+       var message: String
+
+       # The message, followed by the location when one is known
+       redef fun to_s do
+               # `location` is optional, so do not call anything on it before checking
+               var loc = location
+               if loc == null then return "XML Error: {message}"
+               return "XML Error: {message} at {loc.to_s}"
+       end
+end