lib/dom/parser.nit

   1 # This file is part of NIT ( http://www.nitlanguage.org ).
   2 #
   3 # This file is free software, which comes along with NIT.  This software is
   4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
   5 # without  even  the implied warranty of  MERCHANTABILITY or  FITNESS FOR A
   6 # PARTICULAR PURPOSE.  You can modify it is you want,  provided this header
   7 # is kept unaltered, and a notification of the changes is added.
   8 # You  are  allowed  to  redistribute it and sell it, alone or is a part of
   9 # another product.
  10
  11 # XML DOM-parsing facilities
  12 module parser
  13
  14 intrude import parser_base
  15 intrude import xml_entities
  16
  17 # Provides XML parsing facilities
  18 class XMLProcessor
  19         super StringProcessor
  20
  21         # Parses a full XML document
  22         fun parse_document: XMLEntity do
  23                 var stack = new Array[XMLStartTag]
  24                 var doc = new XMLDocument
  25                 loop
  26                         ignore_whitespaces
  27                         if pos >= src.length then break
  28                         if src[pos] == '<' then
  29                                 var tag = read_tag
  30                                 if tag isa XMLStartTag then
  31                                         if stack.is_empty then
  32                                                 tag.parent = doc
  33                                         else
  34                                                 var st_last = stack.last
  35                                                 tag.parent = st_last
  36                                         end
  37                                         stack.push tag
  38                                 else if tag isa XMLEndTag then
  39                                         if stack.is_empty then
  40                                                 return new XMLError(location = tag.location, "Missing matching tag for `{tag.tag_name}`")
  41                                         end
  42                                         var st_last = stack.last
  43                                         if tag.tag_name == st_last.tag_name then
  44                                                 st_last.matching = tag
  45                                                 tag.matching = st_last
  46                                                 stack.pop
  47                                         else
  48                                                 var miss = stack.pop
  49                                                 return new XMLError("Missing matching tag for `{miss.tag_name}`", location=miss.location)
  50                                         end
  51                                 else if tag isa XMLError then
  52                                         return tag
  53                                 else
  54                                         if stack.is_empty then
  55                                                 tag.parent = doc
  56                                         else
  57                                                 tag.parent = stack.last
  58                                         end
  59                                 end
  60                         else
  61                                 var st = pos
  62                                 var end_pc = ignore_until("<") - 1
  63                                 var pc = new PCDATA(src.substring(st, end_pc - st + 1).trim)
  64                                 if stack.is_empty then
  65                                         pc.parent = doc
  66                                 else
  67                                         pc.parent = stack.last
  68                                 end
  69                         end
  70                 end
  71                 if not stack.is_empty then
  72                         var miss = stack.pop
  73                         return new XMLError("Missing matching tag for `{miss.tag_name}`", location=miss.location)
  74                 end
  75                 return doc
  76         end
  77
  78         # Reads the tag starting in `src` at current position
  79         private fun read_tag: XMLEntity do
  80                 var st_loc = new Location(line, line_offset)
  81                 var c = src[pos]
  82                 if not c == '<' then return new XMLError(location=st_loc, "Expected start of tag, got `{c}`")
  83                 var st = pos
  84                 pos += 1
  85                 c = src[pos]
  86                 if c == '!' then
  87                         # Special tag
  88                         return read_special_tag(st_loc)
  89                 else if c == '?' then
  90                         # Prolog tag
  91                         return read_prolog_tag(st_loc)
  92                 else if c == '/' then
  93                         # End tag
  94                         return read_end_tag(st_loc)
  95                 else
  96                         # Start tag
  97                         return read_start_tag(st_loc)
  98                 end
  99         end
 100
 101         # Reads a Special tag (starting with <!)
 102         #
 103         # In case of error, returns a `XMLError`
 104         private fun read_special_tag(st_loc: Location): XMLEntity do
 105                 var srclen = src.length
 106                 pos += 1
 107                 if (pos + 2) >= srclen then return new XMLError(location=st_loc, "Unexpected EOF on start of Special tag")
 108                 if src[pos] == '-' and src[pos + 1] == '-' then
 109                         pos += 2
 110                         var comst = pos
 111                         var endcom = ignore_until("-->")
 112                         if endcom == -1 then return new XMLError(location=st_loc, "Malformatted comment")
 113                         pos += 3
 114                         return new XMLCommentTag(location=st_loc ,src.substring(comst, endcom - comst + 1))
 115                 end
 116                 var st = pos
 117                 if srclen - pos >= 7 then
 118                         var spe_type = src.substring(pos, 7)
 119                         if spe_type == "[CDATA[" then
 120                                 pos += 7
 121                                 var cdst = pos
 122                                 var cdend = ignore_until("]]>")
 123                                 pos += 3
 124                                 if pos >= srclen then return new XMLError(location = st_loc, "Unfinished CDATA block")
 125                                 return new CDATA(src.substring(cdst, cdend - cdst))
 126                         else if spe_type == "DOCTYPE" then
 127                                 pos += 7
 128                                 return parse_doctype(st_loc)
 129                         end
 130                 end
 131                 var end_spec = ignore_until(">")
 132                 pos += 1
 133                 return new XMLSpecialTag(location=st_loc, src.substring(st, end_spec - st))
 134         end
 135
 136         # Parse a Doctype declaration tag
 137         private fun parse_doctype(st_loc: Location): XMLEntity do
 138                 var elemts = new Array[String]
 139                 var srclen = src.length
 140                 loop
 141                         ignore_whitespaces
 142                         if pos >= srclen then return new XMLError(location = st_loc, "Malformatted doctype")
 143                         var c = src[pos]
 144                         # TODO: Properly support intern DOCTYPE definitions
 145                         if c == '[' then
 146                                 var intern_st = pos
 147                                 var intern_end = ignore_until("]")
 148                                 if intern_end == -1 then return new XMLError(location = st_loc, "Unfinished internal doctype declaration")
 149                                 pos += 1
 150                                 elemts.push src.substring(intern_st, intern_end - intern_st + 1)
 151                                 continue
 152                         end
 153                         var elm_st = pos
 154                         while pos < srclen and not src[pos].is_whitespace and src[pos] != '>' do pos += 1
 155                         if pos >= srclen then return new XMLError(location = st_loc, "Malformatted doctype")
 156                         if pos - elm_st > 1 then
 157                                 var str = src.substring(elm_st, pos - elm_st)
 158                                 elemts.push str
 159                         end
 160                         if src[pos] == '>' then
 161                                 pos += 1
 162                                 return new XMLDoctypeTag(location = st_loc, "DOCTYPE", elemts.join(" "))
 163                         end
 164                 end
 165         end
 166
 167         # Reads a Prolog or Processing Instruction tag (starting with <?)
 168         #
 169         # In case of error, returns a `XMLError`
 170         private fun read_prolog_tag(st_loc: Location): XMLEntity do
 171                 var srclen = src.length
 172                 pos += 1
 173                 if pos >= srclen then return new XMLError(location=st_loc, "Invalid start of prolog")
 174                 var idst = pos
 175                 var tag_name = parse_tag_name(['<', '>'])
 176                 var c = src[pos]
 177                 if c == '<' or c == '>' then return new XMLError(location=st_loc ,"Unexpected character `{c}` in prolog declaration")
 178                 if tag_name == "xml" then
 179                         var args = parse_args(['?'])
 180                         for i in args do
 181                                 if i isa BadXMLAttribute then return new XMLError(location = i.location, i.name)
 182                         end
 183                         if src[pos] == '?' then
 184                                 if src[pos + 1] == '>' then
 185                                         pos += 2
 186                                         return new XMLPrologTag(location=st_loc, tag_name, args)
 187                                 end
 188                         end
 189                 else
 190                         if tag_name.has("xml") then return new XMLError(location = st_loc, "Forbidden keyword xml in Processing Instruction")
 191                         var cont_st = pos
 192                         var cont_end = ignore_until("?>")
 193                         if cont_end == -1 then
 194                                 pos += 2
 195                                 return new XMLError(location = st_loc, "Malformatted Processing Instruction tag")
 196                         end
 197                         pos += 2
 198                         return new XMLProcessingInstructionTag(location=st_loc, tag_name, src.substring(cont_st, cont_end - cont_st))
 199                 end
 200                 pos += 1
 201                 return new XMLError(location=st_loc, "Malformatted prolog tag")
 202         end
 203
 204         # Reads an End tag (starting with </)
 205         #
 206         # In case of error, returns a `XMLError`
 207         private fun read_end_tag(st_loc: Location): XMLEntity do
 208                 var srclen = src.length
 209                 pos += 1
 210                 var tag_name = parse_tag_name(['<', '>'])
 211                 ignore_whitespaces
 212                 if src[pos] == '>' then
 213                         pos += 1
 214                         return new XMLEndTag(location=st_loc, tag_name)
 215                 end
 216                 return new XMLError(location = st_loc, "Bad end tag `{tag_name}`")
 217         end
 218
 219         # Reads a Start tag (starting with <)
 220         #
 221         # In case of error, returns a `XMLError`
 222         private fun read_start_tag(st_loc: Location): XMLEntity do
 223                 var srclen = src.length
 224                 var tag_name = parse_tag_name(['/', '>'])
 225                 var args = parse_args(['/', '>'])
 226                 for i in args do
 227                         if i isa BadXMLAttribute then return new XMLError(location=i.location, i.name)
 228                 end
 229                 if src[pos] == '/' then
 230                         if src[pos + 1] == '>' then
 231                                 pos += 2
 232                                 return new XMLOnelinerTag(location=st_loc, tag_name, args)
 233                         end
 234                 end
 235                 pos += 1
 236                 return new XMLStartTag(location=st_loc, tag_name, args)
 237         end
 238
 239         # Parses an xml tag name
 240         private fun parse_tag_name(delims: Array[Char]): String do
 241                 var idst = pos
 242                 var c = src[pos]
 243                 var srclen = src.length
 244                 while pos < srclen and not c.is_whitespace and not delims.has(c) do
 245                         pos += 1
 246                         c = src[pos]
 247                 end
 248                 return src.substring(idst, pos - idst).trim
 249         end
 250
 251         # Parse the arguments of a tag
 252         private fun parse_args(endtags: Array[Char]): Array[XMLAttribute] do
 253                 var attrs = new Array[XMLAttribute]
 254                 loop
 255                         var arg = parse_arg(endtags)
 256                         if arg isa XMLAttributeEnd then return attrs
 257                         attrs.add arg
 258                         if arg isa BadXMLAttribute then return attrs
 259                 end
 260         end
 261
 262         # Parses the next argument in `src`
 263         private fun parse_arg(endtags: Array[Char]): XMLAttribute do
 264                 var srclen = src.length
 265                 var attr: XMLAttribute
 266                 ignore_whitespaces
 267                 var st_loc = new Location(line, line_offset)
 268                 if pos >= srclen then return new BadXMLAttribute(location = st_loc, "Unfinished attribute name")
 269                 # FIXME: Ugly, but as long as it remains private, it is OK I guess
 270                 if endtags.has(src[pos]) then return new XMLAttributeEnd("")
 271                 var attrname_st = pos
 272                 while pos < srclen and src[pos] != '=' and not endtags.has(src[pos]) do pos += 1
 273                 if pos >= srclen then return new BadXMLAttribute(location = st_loc, "Unfinished attribute name")
 274                 if src[pos] != '=' then return new BadXMLAttribute(location = st_loc, "Malformatted attribute")
 275                 var attrname_end = pos - 1
 276                 var name = src.substring(attrname_st, attrname_end - attrname_st + 1).trim
 277                 pos += 1
 278                 ignore_whitespaces
 279                 var attrval_st = pos
 280                 if pos >= srclen then return new BadXMLAttribute(location=st_loc, "Unfinished attribute `{name}`")
 281                 var match = src[pos]
 282                 if match != '\'' and match != '"' then return new BadXMLAttribute(location=st_loc, "Invalid string delimiter `{match}` for attribute `{name}`")
 283                 pos += 1
 284                 while pos < srclen and src[pos] != match do pos += 1
 285                 if pos >= srclen then return new BadXMLAttribute(location=st_loc, "Unfinished attribute `{name}`")
 286                 var attrval_end = pos
 287                 var val = src.substring(attrval_st, attrval_end - attrval_st + 1).trim
 288                 pos += 1
 289                 return new XMLStringAttr(location=st_loc, name, val.substring(1, val.length - 2), match)
 290         end
 291 end
 292
 293 redef class Text
 294         # Tries to parse the current string to XML
 295         #
 296         # Returns an `XMLDocument` if successful, or an `XMLError` if not
 297         fun to_xml: XMLEntity do return (new XMLProcessor(self.to_s)).parse_document
 298 end