lib/dom/parser.nit

   1 # This file is part of NIT ( http://www.nitlanguage.org ).
   2 #
   3 # This file is free software, which comes along with NIT.  This software is
   4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
   5 # without  even  the implied warranty of  MERCHANTABILITY or  FITNESS FOR A
   6 # PARTICULAR PURPOSE.  You can modify it is you want,  provided this header
   7 # is kept unaltered, and a notification of the changes is added.
   8 # You  are  allowed  to  redistribute it and sell it, alone or is a part of
   9 # another product.
  10
  11 # XML DOM-parsing facilities
  12 module parser
  13
  14 intrude import parser_base
  15 intrude import xml_entities
  16
  17 # Provides XML parsing facilities
  18 class XMLProcessor
  19         super StringProcessor
  20
  21         # Parses a full XML document
  22         fun parse_document: XMLEntity do
  23                 var stack = new Array[XMLStartTag]
  24                 var doc = new XMLDocument
  25                 loop
  26                         ignore_whitespaces
  27                         if pos >= src.length then break
  28                         if src[pos] == '<' then
  29                                 var tag = read_tag
  30                                 if tag isa XMLStartTag then
  31                                         if stack.is_empty then
  32                                                 tag.parent = doc
  33                                         else
  34                                                 var st_last = stack.last
  35                                                 tag.parent = st_last
  36                                         end
  37                                         stack.push tag
  38                                 else if tag isa XMLEndTag then
  39                                         if stack.is_empty then
  40                                                 return new XMLError(tag.location, "Missing matching tag for `{tag.tag_name}`")
  41                                         end
  42                                         var st_last = stack.last
  43                                         if tag.tag_name == st_last.tag_name then
  44                                                 st_last.matching = tag
  45                                                 tag.matching = st_last
  46                                                 stack.pop
  47                                         else
  48                                                 var miss = stack.pop
  49                                                 return new XMLError(miss.location, "Missing matching tag for `{miss.tag_name}`")
  50                                         end
  51                                 else if tag isa XMLError then
  52                                         return tag
  53                                 else
  54                                         if stack.is_empty then
  55                                                 tag.parent = doc
  56                                         else
  57                                                 tag.parent = stack.last
  58                                         end
  59                                 end
  60                         else
  61                                 var st = pos
  62                                 var end_pc = ignore_until("<") - 1
  63                                 var loc = new Location(line, line_offset)
  64                                 var pc = new PCDATA(loc, src.substring(st, end_pc - st + 1).trim)
  65                                 if stack.is_empty then
  66                                         pc.parent = doc
  67                                 else
  68                                         pc.parent = stack.last
  69                                 end
  70                         end
  71                 end
  72                 if not stack.is_empty then
  73                         var miss = stack.pop
  74                         return new XMLError(miss.location, "Missing matching tag for `{miss.tag_name}`")
  75                 end
  76                 return doc
  77         end
  78
  79         # Reads the tag starting in `src` at current position
  80         private fun read_tag: XMLEntity do
  81                 var st_loc = new Location(line, line_offset)
  82                 var c = src[pos]
  83                 if not c == '<' then return new XMLError(st_loc, "Expected start of tag, got `{c}`")
  84                 pos += 1
  85                 c = src[pos]
  86                 if c == '!' then
  87                         # Special tag
  88                         return read_special_tag(st_loc)
  89                 else if c == '?' then
  90                         # Prolog tag
  91                         return read_prolog_tag(st_loc)
  92                 else if c == '/' then
  93                         # End tag
  94                         return read_end_tag(st_loc)
  95                 else
  96                         # Start tag
  97                         return read_start_tag(st_loc)
  98                 end
  99         end
 100
 101         # Reads a Special tag (starting with <!)
 102         #
 103         # In case of error, returns a `XMLError`
 104         private fun read_special_tag(st_loc: Location): XMLEntity do
 105                 var srclen = src.length
 106                 pos += 1
 107                 if (pos + 2) >= srclen then return new XMLError(st_loc, "Unexpected EOF on start of Special tag")
 108                 if src[pos] == '-' and src[pos + 1] == '-' then
 109                         pos += 2
 110                         var comst = pos
 111                         var endcom = ignore_until("-->")
 112                         if endcom == -1 then return new XMLError(st_loc, "Malformed comment")
 113                         pos += 3
 114                         return new XMLCommentTag(st_loc ,src.substring(comst, endcom - comst + 1))
 115                 end
 116                 var st = pos
 117                 if srclen - pos >= 7 then
 118                         var spe_type = src.substring(pos, 7)
 119                         if spe_type == "[CDATA[" then
 120                                 pos += 7
 121                                 var cdst = pos
 122                                 var cdend = ignore_until("]]>")
 123                                 pos += 3
 124                                 if pos >= srclen then return new XMLError(st_loc, "Unfinished CDATA block")
 125                                 return new CDATA(st_loc, src.substring(cdst, cdend - cdst))
 126                         else if spe_type == "DOCTYPE" then
 127                                 pos += 7
 128                                 return parse_doctype(st_loc)
 129                         end
 130                 end
 131                 var end_spec = ignore_until(">")
 132                 pos += 1
 133                 return new XMLSpecialTag(st_loc, src.substring(st, end_spec - st))
 134         end
 135
 136         # Parse a Doctype declaration tag
 137         private fun parse_doctype(st_loc: Location): XMLEntity do
 138                 var elemts = new Array[String]
 139                 var srclen = src.length
 140                 loop
 141                         ignore_whitespaces
 142                         if pos >= srclen then return new XMLError(st_loc, "Malformed doctype")
 143                         var c = src[pos]
 144                         # TODO: Properly support intern DOCTYPE definitions
 145                         if c == '[' then
 146                                 var intern_st = pos
 147                                 var intern_end = ignore_until("]")
 148                                 if intern_end == -1 then return new XMLError(st_loc, "Unfinished internal doctype declaration")
 149                                 pos += 1
 150                                 elemts.push src.substring(intern_st, intern_end - intern_st + 1)
 151                                 continue
 152                         end
 153                         var elm_st = pos
 154                         while pos < srclen and not src[pos].is_whitespace and src[pos] != '>' do pos += 1
 155                         if pos >= srclen then return new XMLError(st_loc, "Malformed doctype")
 156                         if pos - elm_st > 1 then
 157                                 var str = src.substring(elm_st, pos - elm_st)
 158                                 elemts.push str
 159                         end
 160                         if src[pos] == '>' then
 161                                 pos += 1
 162                                 return new XMLDoctypeTag(st_loc, "DOCTYPE", elemts.join(" "))
 163                         end
 164                 end
 165         end
 166
 167         # Reads a Prolog or Processing Instruction tag (starting with <?)
 168         #
 169         # In case of error, returns a `XMLError`
 170         private fun read_prolog_tag(st_loc: Location): XMLEntity do
 171                 var srclen = src.length
 172                 pos += 1
 173                 if pos >= srclen then return new XMLError(st_loc, "Invalid start of prolog")
 174                 var tag_name = parse_tag_name(['<', '>'])
 175                 var c = src[pos]
 176                 if c == '<' or c == '>' then return new XMLError(st_loc ,"Unexpected character `{c}` in prolog declaration")
 177                 if tag_name == "xml" then
 178                         var args = parse_args(['?'])
 179                         for i in args do
 180                                 if i isa BadXMLAttribute then return new XMLError(i.location, i.name)
 181                         end
 182                         if src[pos] == '?' then
 183                                 if src[pos + 1] == '>' then
 184                                         pos += 2
 185                                         return new XMLPrologTag(st_loc, tag_name, args)
 186                                 end
 187                         end
 188                 else
 189                         if tag_name.has("xml") then return new XMLError(st_loc, "Forbidden keyword xml in Processing Instruction")
 190                         var cont_st = pos
 191                         var cont_end = ignore_until("?>")
 192                         if cont_end == -1 then
 193                                 pos += 2
 194                                 return new XMLError(st_loc, "Malformed Processing Instruction tag")
 195                         end
 196                         pos += 2
 197                         return new XMLProcessingInstructionTag(st_loc, tag_name, src.substring(cont_st, cont_end - cont_st))
 198                 end
 199                 pos += 1
 200                 return new XMLError(st_loc, "Malformed prolog tag")
 201         end
 202
 203         # Reads an End tag (starting with </)
 204         #
 205         # In case of error, returns a `XMLError`
 206         private fun read_end_tag(st_loc: Location): XMLEntity do
 207                 pos += 1
 208                 var tag_name = parse_tag_name(['<', '>'])
 209                 ignore_whitespaces
 210                 if src[pos] == '>' then
 211                         pos += 1
 212                         return new XMLEndTag(st_loc, tag_name)
 213                 end
 214                 return new XMLError(st_loc, "Bad end tag `{tag_name}`")
 215         end
 216
 217         # Reads a Start tag (starting with <)
 218         #
 219         # In case of error, returns a `XMLError`
 220         private fun read_start_tag(st_loc: Location): XMLEntity do
 221                 var tag_name = parse_tag_name(['/', '>'])
 222                 var args = parse_args(['/', '>'])
 223                 for i in args do
 224                         if i isa BadXMLAttribute then return new XMLError(i.location, i.name)
 225                 end
 226                 if src[pos] == '/' then
 227                         if src[pos + 1] == '>' then
 228                                 pos += 2
 229                                 return new XMLOnelinerTag(st_loc, tag_name, args)
 230                         end
 231                 end
 232                 pos += 1
 233                 return new XMLStartTag(st_loc, tag_name, args)
 234         end
 235
 236         # Parses an xml tag name
 237         private fun parse_tag_name(delims: Array[Char]): String do
 238                 var idst = pos
 239                 var c = src[pos]
 240                 var srclen = src.length
 241                 while pos < srclen and not c.is_whitespace and not delims.has(c) do
 242                         pos += 1
 243                         c = src[pos]
 244                 end
 245                 return src.substring(idst, pos - idst).trim
 246         end
 247
 248         # Parse the arguments of a tag
 249         private fun parse_args(endtags: Array[Char]): Array[XMLAttribute] do
 250                 var attrs = new Array[XMLAttribute]
 251                 loop
 252                         var arg = parse_arg(endtags)
 253                         if arg isa XMLAttributeEnd then return attrs
 254                         attrs.add arg
 255                         if arg isa BadXMLAttribute then return attrs
 256                 end
 257         end
 258
 259         # Parses the next argument in `src`
 260         private fun parse_arg(endtags: Array[Char]): XMLAttribute do
 261                 var srclen = src.length
 262                 ignore_whitespaces
 263                 var st_loc = new Location(line, line_offset)
 264                 if pos >= srclen then return new BadXMLAttribute(st_loc, "Unfinished attribute name")
 265                 # FIXME: Ugly, but as long as it remains private, it is OK I guess
 266                 if endtags.has(src[pos]) then return new XMLAttributeEnd(st_loc, "")
 267                 var attrname_st = pos
 268                 while pos < srclen and src[pos] != '=' and not endtags.has(src[pos]) do pos += 1
 269                 if pos >= srclen then return new BadXMLAttribute(st_loc, "Unfinished attribute name")
 270                 if src[pos] != '=' then return new BadXMLAttribute(st_loc, "Malformed attribute")
 271                 var attrname_end = pos - 1
 272                 var name = src.substring(attrname_st, attrname_end - attrname_st + 1).trim
 273                 pos += 1
 274                 ignore_whitespaces
 275                 var attrval_st = pos
 276                 if pos >= srclen then return new BadXMLAttribute(st_loc, "Unfinished attribute `{name}`")
 277                 var match = src[pos]
 278                 if match != '\'' and match != '"' then return new BadXMLAttribute(st_loc, "Invalid string delimiter `{match}` for attribute `{name}`")
 279                 pos += 1
 280                 while pos < srclen and src[pos] != match do pos += 1
 281                 if pos >= srclen then return new BadXMLAttribute(st_loc, "Unfinished attribute `{name}`")
 282                 var attrval_end = pos
 283                 var val = src.substring(attrval_st, attrval_end - attrval_st + 1).trim
 284                 pos += 1
 285                 return new XMLStringAttr(st_loc, name, val.substring(1, val.length - 2), match)
 286         end
 287 end
 288
 289 redef class Text
 290         # Tries to parse the current string to XML
 291         #
 292         # Returns an `XMLDocument` if successful, or an `XMLError` if not
 293         fun to_xml: XMLEntity do return (new XMLProcessor(self.to_s)).parse_document
 294 end