lib/dom: Introducing simple DOM XML parser
[nit.git] / lib / dom / parser.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # XML DOM-parsing facilities
12 module parser
13
14 intrude import parser_base
15 intrude import xml_entities
16
17 # Provides XML parsing facilities
18 class XMLProcessor
19 super StringProcessor
20
# Parses a full XML document
#
# Entities are read one after the other from `src`; each one gets as
# `parent` the innermost start tag that is still open, or the document
# itself when no tag is open.
#
# Returns the `XMLDocument`, or a `XMLError` when `read_tag` reports an
# error, when an end tag does not match the innermost open start tag,
# or when a start tag is still open at the end of `src`.
fun parse_document: XMLEntity do
	# Start tags still waiting for their end tag, innermost last
	var stack = new Array[XMLStartTag]
	var doc = new XMLDocument
	loop
		ignore_whitespaces
		if pos >= src.length then break
		if src[pos] == '<' then
			var tag = read_tag
			if tag isa XMLStartTag then
				if stack.is_empty then
					tag.parent = doc
				else
					tag.parent = stack.last
				end
				stack.push tag
			else if tag isa XMLEndTag then
				if stack.is_empty then
					return new XMLError(location = tag.location, "Missing matching tag for `{tag.tag_name}`")
				end
				var st_last = stack.last
				if tag.tag_name == st_last.tag_name then
					# Link both halves of the element together
					st_last.matching = tag
					tag.matching = st_last
					stack.pop
				else
					# The innermost open tag is the one that was never closed
					var miss = stack.pop
					return new XMLError("Missing matching tag for `{miss.tag_name}`", location=miss.location)
				end
			else if tag isa XMLError then
				return tag
			else
				# Any other entity is simply attached to the current parent
				if stack.is_empty then
					tag.parent = doc
				else
					tag.parent = stack.last
				end
			end
		else
			# Character data: everything up to the next tag
			var st = pos
			var end_pc = ignore_until("<")
			if end_pc == -1 then
				# No tag follows: the text runs up to the end of `src`.
				# Without this, the -1 would give `substring` a negative length.
				end_pc = src.length
				pos = end_pc
			end
			var pc = new PCDATA(src.substring(st, end_pc - st).trim)
			if stack.is_empty then
				pc.parent = doc
			else
				pc.parent = stack.last
			end
		end
	end
	if not stack.is_empty then
		var miss = stack.pop
		return new XMLError("Missing matching tag for `{miss.tag_name}`", location=miss.location)
	end
	return doc
end
77
# Reads the tag starting in `src` at current position
#
# The character following `<` selects the reader: `!` for special tags,
# `?` for prolog and processing instruction tags, `/` for end tags,
# anything else for start tags.
#
# In case of error, returns a `XMLError`
private fun read_tag: XMLEntity do
	var st_loc = new Location(line, line_offset)
	var c = src[pos]
	if c != '<' then return new XMLError(location=st_loc, "Expected start of tag, got `{c}`")
	pos += 1
	# A lone `<` closing the document leaves no character to dispatch on
	if pos >= src.length then return new XMLError(location=st_loc, "Unexpected EOF on start of tag")
	c = src[pos]
	if c == '!' then
		# Special tag
		return read_special_tag(st_loc)
	else if c == '?' then
		# Prolog tag
		return read_prolog_tag(st_loc)
	else if c == '/' then
		# End tag
		return read_end_tag(st_loc)
	else
		# Start tag
		return read_start_tag(st_loc)
	end
end
100
# Reads a Special tag (starting with <!)
#
# Handles comments (`<!-- ... -->`), CDATA blocks (`<![CDATA[ ... ]]>`)
# and DOCTYPE declarations (delegated to `parse_doctype`); anything
# else is returned as a `XMLSpecialTag` holding the text up to the
# next `>`.
#
# In case of error, returns a `XMLError`
private fun read_special_tag(st_loc: Location): XMLEntity do
	var srclen = src.length
	# Skip the `!`
	pos += 1
	if (pos + 2) >= srclen then return new XMLError(location=st_loc, "Unexpected EOF on start of Special tag")
	if src[pos] == '-' and src[pos + 1] == '-' then
		pos += 2
		var comst = pos
		var endcom = ignore_until("-->")
		if endcom == -1 then return new XMLError(location=st_loc, "Malformatted comment")
		pos += 3
		# `endcom` is where the closing `-->` starts, so the comment text
		# stops right before it (same convention as the CDATA and
		# special tag extractions below).
		return new XMLCommentTag(location=st_loc, src.substring(comst, endcom - comst))
	end
	var st = pos
	if srclen - pos >= 7 then
		var spe_type = src.substring(pos, 7)
		if spe_type == "[CDATA[" then
			pos += 7
			var cdst = pos
			var cdend = ignore_until("]]>")
			# Test the search result itself: a block closed exactly at the
			# end of `src` is finished, a missing `]]>` is not.
			if cdend == -1 then return new XMLError(location = st_loc, "Unfinished CDATA block")
			pos += 3
			return new CDATA(src.substring(cdst, cdend - cdst))
		else if spe_type == "DOCTYPE" then
			pos += 7
			return parse_doctype(st_loc)
		end
	end
	var end_spec = ignore_until(">")
	# Without a closing `>` the length below would be negative
	if end_spec == -1 then return new XMLError(location=st_loc, "Unfinished Special tag")
	pos += 1
	return new XMLSpecialTag(location=st_loc, src.substring(st, end_spec - st))
end
135
# Parse a Doctype declaration tag
#
# Called with `pos` right after the `DOCTYPE` keyword. The declaration
# is split on whitespace, an internal subset (`[ ... ]`) being kept as
# a single element, and the elements are joined back with single
# spaces as content of the returned `XMLDoctypeTag`.
#
# In case of error, returns a `XMLError`
private fun parse_doctype(st_loc: Location): XMLEntity do
	var elemts = new Array[String]
	var srclen = src.length
	loop
		ignore_whitespaces
		if pos >= srclen then return new XMLError(location = st_loc, "Malformatted doctype")
		var c = src[pos]
		# TODO: Properly support intern DOCTYPE definitions
		if c == '[' then
			var intern_st = pos
			var intern_end = ignore_until("]")
			if intern_end == -1 then return new XMLError(location = st_loc, "Unfinished internal doctype declaration")
			# Skip the `]`, which is kept in the element (hence the `+ 1`)
			pos += 1
			elemts.push src.substring(intern_st, intern_end - intern_st + 1)
			continue
		end
		var elm_st = pos
		while pos < srclen and not src[pos].is_whitespace and src[pos] != '>' do pos += 1
		if pos >= srclen then return new XMLError(location = st_loc, "Malformatted doctype")
		# Keep every non-empty element, including one-character names
		# such as the `a` of `<!DOCTYPE a>`.
		if pos - elm_st > 0 then
			var str = src.substring(elm_st, pos - elm_st)
			elemts.push str
		end
		if src[pos] == '>' then
			pos += 1
			return new XMLDoctypeTag(location = st_loc, "DOCTYPE", elemts.join(" "))
		end
	end
end
166
# Reads a Prolog or Processing Instruction tag (starting with <?)
#
# A target named exactly `xml` is read as the prolog, with its
# attributes, and gives a `XMLPrologTag`. Any other target gives a
# `XMLProcessingInstructionTag` whose content is the raw text up to
# the closing `?>`.
#
# In case of error, returns a `XMLError`
private fun read_prolog_tag(st_loc: Location): XMLEntity do
	var srclen = src.length
	# Skip the `?`
	pos += 1
	if pos >= srclen then return new XMLError(location=st_loc, "Invalid start of prolog")
	# `?` cannot be part of a name: stopping on it lets a target directly
	# followed by `?>` (as in `<?foo?>`) be read properly.
	var tag_name = parse_tag_name(['<', '>', '?'])
	if pos >= srclen then return new XMLError(location=st_loc, "Malformatted prolog tag")
	var c = src[pos]
	if c == '<' or c == '>' then return new XMLError(location=st_loc ,"Unexpected character `{c}` in prolog declaration")
	if tag_name == "xml" then
		var args = parse_args(['?'])
		for i in args do
			if i isa BadXMLAttribute then return new XMLError(location = i.location, i.name)
		end
		# `parse_args` stopped on a `?`: it must be followed by `>`
		if src[pos] == '?' and pos + 1 < srclen and src[pos + 1] == '>' then
			pos += 2
			return new XMLPrologTag(location=st_loc, tag_name, args)
		end
	else
		# Only the bare name `xml`, in any letter case, is reserved;
		# targets that merely contain it (`xml-stylesheet`) are legal.
		if tag_name.to_lower == "xml" then return new XMLError(location = st_loc, "Forbidden keyword xml in Processing Instruction")
		var cont_st = pos
		var cont_end = ignore_until("?>")
		if cont_end == -1 then
			pos += 2
			return new XMLError(location = st_loc, "Malformatted Processing Instruction tag")
		end
		pos += 2
		return new XMLProcessingInstructionTag(location=st_loc, tag_name, src.substring(cont_st, cont_end - cont_st))
	end
	pos += 1
	return new XMLError(location=st_loc, "Malformatted prolog tag")
end
203
# Reads an End tag (starting with </)
#
# Whitespace is accepted between the tag name and the closing `>`.
#
# In case of error, returns a `XMLError`
private fun read_end_tag(st_loc: Location): XMLEntity do
	var srclen = src.length
	# Skip the `/`
	pos += 1
	# `</` closing the document: there is no name to read
	if pos >= srclen then return new XMLError(location = st_loc, "Unexpected EOF in end tag")
	var tag_name = parse_tag_name(['<', '>'])
	ignore_whitespaces
	# The bound is tested first: the name may run up to the end of `src`
	if pos < srclen and src[pos] == '>' then
		pos += 1
		return new XMLEndTag(location=st_loc, tag_name)
	end
	return new XMLError(location = st_loc, "Bad end tag `{tag_name}`")
end
218
# Reads a Start tag (starting with <)
#
# Returns a `XMLOnelinerTag` when the tag is closed by `/>`, a
# `XMLStartTag` when it is closed by `>`.
#
# In case of error, returns a `XMLError`
private fun read_start_tag(st_loc: Location): XMLEntity do
	var tag_name = parse_tag_name(['/', '>'])
	var args = parse_args(['/', '>'])
	for i in args do
		if i isa BadXMLAttribute then return new XMLError(location=i.location, i.name)
	end
	# `parse_args` only returns without a bad attribute once it stopped
	# on one of its end characters, so `src[pos]` is `/` or `>` here.
	if src[pos] == '/' then
		# A `/` is only legal as part of the closing `/>`
		if pos + 1 >= src.length or src[pos + 1] != '>' then
			return new XMLError(location=st_loc, "Malformatted tag `{tag_name}`")
		end
		pos += 2
		return new XMLOnelinerTag(location=st_loc, tag_name, args)
	end
	# Skip the closing `>`
	pos += 1
	return new XMLStartTag(location=st_loc, tag_name, args)
end
238
# Parses an xml tag name
#
# Reads from the current position up to the first whitespace, the
# first character of `delims`, or the end of `src`, and leaves `pos`
# on that stopping point. The name is returned trimmed.
private fun parse_tag_name(delims: Array[Char]): String do
	var idst = pos
	var srclen = src.length
	# The bound is tested before each read, so a name running up to the
	# end of `src` never reads past it.
	while pos < srclen and not src[pos].is_whitespace and not delims.has(src[pos]) do pos += 1
	return src.substring(idst, pos - idst).trim
end
250
# Parse the arguments of a tag
#
# Attributes are read with `parse_arg` until it signals the end of the
# attribute list with a `XMLAttributeEnd`, which is not stored. Reading
# also stops after the first `BadXMLAttribute`, which is kept as last
# element so that the caller can report it.
private fun parse_args(endtags: Array[Char]): Array[XMLAttribute] do
	var found = new Array[XMLAttribute]
	var current = parse_arg(endtags)
	while not (current isa XMLAttributeEnd) do
		found.add current
		if current isa BadXMLAttribute then break
		current = parse_arg(endtags)
	end
	return found
end
261
# Parses the next argument in `src`
#
# Leading whitespace is skipped first. Returns:
#
# * a `XMLAttributeEnd` when the next character is one of `endtags`
# * a `XMLStringAttr` for a `name="value"` or `name='value'` pair
# * a `BadXMLAttribute` carrying an error message in any other case
private fun parse_arg(endtags: Array[Char]): XMLAttribute do
	var srclen = src.length
	ignore_whitespaces
	var st_loc = new Location(line, line_offset)
	if pos >= srclen then return new BadXMLAttribute(location = st_loc, "Unfinished attribute name")
	# FIXME: Ugly, but as long as it remains private, it is OK I guess
	if endtags.has(src[pos]) then return new XMLAttributeEnd("")

	# Attribute name: everything up to the `=`
	var name_start = pos
	loop
		if pos >= srclen then return new BadXMLAttribute(location = st_loc, "Unfinished attribute name")
		var cur = src[pos]
		if cur == '=' then break
		# Reaching the end of the tag before any `=`
		if endtags.has(cur) then return new BadXMLAttribute(location = st_loc, "Malformatted attribute")
		pos += 1
	end
	var name = src.substring(name_start, pos - name_start).trim

	# Skip the `=` and what separates it from the value
	pos += 1
	ignore_whitespaces
	if pos >= srclen then return new BadXMLAttribute(location=st_loc, "Unfinished attribute `{name}`")
	var match = src[pos]
	if not (match == '\'' or match == '"') then return new BadXMLAttribute(location=st_loc, "Invalid string delimiter `{match}` for attribute `{name}`")

	# Attribute value: everything between the two delimiters
	var value_start = pos + 1
	pos = value_start
	loop
		if pos >= srclen then return new BadXMLAttribute(location=st_loc, "Unfinished attribute `{name}`")
		if src[pos] == match then break
		pos += 1
	end
	var value = src.substring(value_start, pos - value_start)
	# Skip the closing delimiter
	pos += 1
	return new XMLStringAttr(location=st_loc, name, value, match)
end
291 end
292
redef class Text
	# Tries to parse the current string to XML
	#
	# The text is converted with `to_s` and handed to a new `XMLProcessor`.
	#
	# Returns an `XMLDocument` if successful, or an `XMLError` if not
	fun to_xml: XMLEntity do
		var processor = new XMLProcessor(self.to_s)
		return processor.parse_document
	end
end