7ebe6b0dc0d844539b7d249194a8ffe49bfc6577
[nit.git] / lib / dom / parser.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # XML DOM-parsing facilities
12 module parser
13
14 intrude import parser_base
15 intrude import xml_entities
16
17 # Provides XML parsing facilities
18 class XMLProcessor
19 super StringProcessor
20
# Parses a full XML document
#
# Entities are read from `src` until its end is reached. Each entity is
# attached to the innermost start tag that is still open, or to the
# document itself when no tag is open.
#
# Returns the resulting `XMLDocument`, or an `XMLError` for the first
# malformed tag, unmatched end tag, or start tag left open at the end.
fun parse_document: XMLEntity do
	# Start tags still waiting for their end tag, innermost last
	var stack = new Array[XMLStartTag]
	var doc = new XMLDocument
	loop
		ignore_whitespaces
		if pos >= src.length then break
		# Whatever is read during this iteration belongs to `owner`
		var owner: XMLEntity = doc
		if not stack.is_empty then owner = stack.last
		if src[pos] == '<' then
			var tag = read_tag
			if tag isa XMLStartTag then
				tag.parent = owner
				stack.push tag
			else if tag isa XMLEndTag then
				if stack.is_empty then
					return new XMLError(tag.location, "Missing matching tag for `{tag.tag_name}`")
				end
				# The innermost open tag is consumed whether it matches or not
				var st_last = stack.pop
				if tag.tag_name != st_last.tag_name then
					return new XMLError(st_last.location, "Missing matching tag for `{st_last.tag_name}`")
				end
				st_last.matching = tag
				tag.matching = st_last
			else if tag isa XMLError then
				return tag
			else
				tag.parent = owner
			end
		else
			# Record the location before scanning, so that it designates
			# the start of the text, as is done for tags in `read_tag`
			var loc = new Location(line, line_offset)
			var st = pos
			var end_pc = ignore_until("<")
			if end_pc == -1 then
				# No further tag: the text runs up to the end of `src`
				end_pc = src.length
				pos = end_pc
			end
			var pc = new PCDATA(loc, src.substring(st, end_pc - st).trim)
			pc.parent = owner
		end
	end
	if not stack.is_empty then
		var miss = stack.pop
		return new XMLError(miss.location, "Missing matching tag for `{miss.tag_name}`")
	end
	return doc
end
78
# Reads the tag found at the current position in `src`
#
# The character following the opening `<` selects the reader to use:
# `!` for special tags, `?` for prolog tags, `/` for end tags and
# anything else for start tags.
#
# Returns a `XMLError` if `src` does not hold a `<` at the current
# position or ends right after it.
private fun read_tag: XMLEntity do
	var start = new Location(line, line_offset)
	var first = src[pos]
	if first != '<' then return new XMLError(start, "Expected start of tag, got `{first}`")
	pos += 1
	if pos >= src.length then return new XMLError(start, "Malformed tag")
	var kind = src[pos]
	# Special tag
	if kind == '!' then return read_special_tag(start)
	# Prolog tag
	if kind == '?' then return read_prolog_tag(start)
	# End tag
	if kind == '/' then return read_end_tag(start)
	# Start tag
	return read_start_tag(start)
end
101
# Reads a Special tag (starting with <!)
#
# Comments (`<!-- ... -->`), CDATA sections (`<![CDATA[ ... ]]>`) and
# DOCTYPE declarations are recognized; any other content is returned
# as a `XMLSpecialTag` holding the text found before the next `>`.
#
# In case of error, returns a `XMLError`
private fun read_special_tag(st_loc: Location): XMLEntity do
	var srclen = src.length
	pos += 1
	if (pos + 2) >= srclen then return new XMLError(st_loc, "Unexpected EOF on start of Special tag")
	if src[pos] == '-' and src[pos + 1] == '-' then
		pos += 2
		var comst = pos
		var endcom = ignore_until("-->")
		if endcom == -1 then return new XMLError(st_loc, "Malformed comment")
		pos += 3
		# `endcom` is the index of the first `-` of the closing `-->`,
		# which does not belong to the text of the comment
		return new XMLCommentTag(st_loc, src.substring(comst, endcom - comst))
	end
	var st = pos
	if srclen - pos >= 7 then
		var spe_type = src.substring(pos, 7)
		if spe_type == "[CDATA[" then
			pos += 7
			var cdst = pos
			var cdend = ignore_until("]]>")
			# Test the result of the search rather than `pos`: a section
			# closed by the very last characters of `src` is complete
			if cdend == -1 then return new XMLError(st_loc, "Unfinished CDATA block")
			pos += 3
			return new CDATA(st_loc, src.substring(cdst, cdend - cdst))
		else if spe_type == "DOCTYPE" then
			pos += 7
			return parse_doctype(st_loc)
		end
	end
	var end_spec = ignore_until(">")
	# Without a closing `>` there is no tag to return
	if end_spec == -1 then return new XMLError(st_loc, "Unfinished Special tag")
	pos += 1
	return new XMLSpecialTag(st_loc, src.substring(st, end_spec - st))
end
136
# Parse a Doctype declaration tag
#
# Called with the current position right after the `DOCTYPE` keyword.
# The declaration is split in whitespace-separated elements, an internal
# subset (`[ ... ]`) being kept as one single element, and the elements
# are joined with single spaces in the returned `XMLDoctypeTag`.
#
# Returns a `XMLError` if `src` ends before the closing `>`.
private fun parse_doctype(st_loc: Location): XMLEntity do
	var elemts = new Array[String]
	var srclen = src.length
	loop
		ignore_whitespaces
		if pos >= srclen then return new XMLError(st_loc, "Malformed doctype")
		var c = src[pos]
		# TODO: Properly support intern DOCTYPE definitions
		if c == '[' then
			var intern_st = pos
			var intern_end = ignore_until("]")
			if intern_end == -1 then return new XMLError(st_loc, "Unfinished internal doctype declaration")
			pos += 1
			# Both brackets are kept in the element
			elemts.push src.substring(intern_st, intern_end - intern_st + 1)
			continue
		end
		var elm_st = pos
		while pos < srclen and not src[pos].is_whitespace and src[pos] != '>' do pos += 1
		if pos >= srclen then return new XMLError(st_loc, "Malformed doctype")
		# Any non-empty element is kept, single-character ones included
		# (e.g. the root name in `<!DOCTYPE a>`)
		if pos - elm_st > 0 then
			var str = src.substring(elm_st, pos - elm_st)
			elemts.push str
		end
		if src[pos] == '>' then
			pos += 1
			return new XMLDoctypeTag(st_loc, "DOCTYPE", elemts.join(" "))
		end
	end
end
167
# Reads a Prolog or Processing Instruction tag (starting with <?)
#
# A tag named `xml` is read as a prolog with attributes and must be
# closed by `?>`. Any other name is read as a Processing Instruction
# whose content is the raw text found before the next `?>`.
#
# In case of error, returns a `XMLError`
private fun read_prolog_tag(st_loc: Location): XMLEntity do
	var srclen = src.length
	pos += 1
	if pos >= srclen then return new XMLError(st_loc, "Invalid start of prolog")
	var tag_name = parse_tag_name(['<', '>'])
	# The name may run up to the end of `src`
	if pos >= srclen then return new XMLError(st_loc, "Malformed prolog tag")
	var c = src[pos]
	if c == '<' or c == '>' then return new XMLError(st_loc ,"Unexpected character `{c}` in prolog declaration")
	if tag_name == "xml" then
		var args = parse_args(['?'])
		for i in args do
			# The message of a bad attribute is stored in its name
			if i isa BadXMLAttribute then return new XMLError(i.location, i.name)
		end
		# Bounds are checked since `src` may end right after the `?`
		if pos + 1 < srclen and src[pos] == '?' and src[pos + 1] == '>' then
			pos += 2
			return new XMLPrologTag(st_loc, tag_name, args)
		end
	else
		# Only the exact name `xml`, whatever its case, is reserved:
		# targets such as `xml-stylesheet` are valid
		if tag_name.to_lower == "xml" then return new XMLError(st_loc, "Forbidden keyword xml in Processing Instruction")
		var cont_st = pos
		var cont_end = ignore_until("?>")
		if cont_end == -1 then
			pos += 2
			return new XMLError(st_loc, "Malformed Processing Instruction tag")
		end
		pos += 2
		return new XMLProcessingInstructionTag(st_loc, tag_name, src.substring(cont_st, cont_end - cont_st))
	end
	pos += 1
	return new XMLError(st_loc, "Malformed prolog tag")
end
203
# Reads an End tag (starting with </)
#
# The name of the tag may be followed by whitespaces before the
# closing `>`.
#
# In case of error, returns a `XMLError`
private fun read_end_tag(st_loc: Location): XMLEntity do
	pos += 1
	var tag_name = parse_tag_name(['<', '>'])
	ignore_whitespaces
	# `src` may end before the closing `>`, so bounds are checked first
	if pos < src.length and src[pos] == '>' then
		pos += 1
		return new XMLEndTag(st_loc, tag_name)
	end
	return new XMLError(st_loc, "Bad end tag `{tag_name}`")
end
217
# Reads a Start tag (starting with <)
#
# Returns a `XMLOnelinerTag` when the tag is closed by `/>`, and a
# `XMLStartTag` when it is closed by `>`.
#
# In case of error, returns a `XMLError`
private fun read_start_tag(st_loc: Location): XMLEntity do
	var tag_name = parse_tag_name(['/', '>'])
	var args = parse_args(['/', '>'])
	for i in args do
		# The message of a bad attribute is stored in its name
		if i isa BadXMLAttribute then return new XMLError(i.location, i.name)
	end
	if src[pos] == '/' then
		# Bounds are checked since `src` may end right after the `/`
		if pos + 1 < src.length and src[pos + 1] == '>' then
			pos += 2
			return new XMLOnelinerTag(st_loc, tag_name, args)
		end
		# A `/` can only be part of the closing `/>`
		return new XMLError(st_loc, "Malformed tag `{tag_name}`")
	end
	pos += 1
	return new XMLStartTag(st_loc, tag_name, args)
end
236
# Parses an XML tag name
#
# Characters are consumed from the current position until a whitespace,
# one of `delims` or the end of `src` is met. The stopping character is
# left unread.
#
# Returns the consumed text, trimmed.
private fun parse_tag_name(delims: Array[Char]): String do
	var start = pos
	var limit = src.length
	loop
		if pos >= limit then break
		var cur = src[pos]
		if delims.has(cur) or cur.is_whitespace then break
		pos += 1
	end
	var raw = src.substring(start, pos - start)
	return raw.trim
end
248
# Parses the attributes of a tag
#
# Attributes are read one after the other with `parse_arg` until the end
# of the attribute list is signalled (the end marker is not stored), or
# until a `BadXMLAttribute` is met (it is stored as the last element).
private fun parse_args(endtags: Array[Char]): Array[XMLAttribute] do
	var found = new Array[XMLAttribute]
	loop
		var current = parse_arg(endtags)
		# End of the attribute list, nothing more to collect
		if current isa XMLAttributeEnd then break
		found.push current
		# Parsing cannot go on after a malformed attribute
		if current isa BadXMLAttribute then break
	end
	return found
end
259
# Parses the next attribute in `src`
#
# Leading whitespaces are skipped, then one of the following is returned:
#
# * a `XMLAttributeEnd` if the current character is one of `endtags`
# * a `XMLStringAttr` for a well-formed `name="value"` or `name='value'`
# * a `BadXMLAttribute`, whose name holds the error message, otherwise
private fun parse_arg(endtags: Array[Char]): XMLAttribute do
	var limit = src.length
	ignore_whitespaces
	var loc = new Location(line, line_offset)
	if pos >= limit then return new BadXMLAttribute(loc, "Unfinished attribute name")
	# FIXME: Ugly, but as long as it remains private, it is OK I guess
	if endtags.has(src[pos]) then return new XMLAttributeEnd(loc, "")

	# The name runs up to the `=` sign
	var name_start = pos
	loop
		if pos >= limit then return new BadXMLAttribute(loc, "Unfinished attribute name")
		var cur = src[pos]
		if cur == '=' then break
		if endtags.has(cur) then return new BadXMLAttribute(loc, "Malformed attribute")
		pos += 1
	end
	var name = src.substring(name_start, pos - name_start).trim

	# Skip the `=` sign and what separates it from the value
	pos += 1
	ignore_whitespaces
	if pos >= limit then return new BadXMLAttribute(loc, "Unfinished attribute `{name}`")
	var quote = src[pos]
	if quote != '\'' and quote != '"' then return new BadXMLAttribute(loc, "Invalid string delimiter `{quote}` for attribute `{name}`")

	# The value is what lies between the two matching delimiters
	var value_start = pos + 1
	pos = value_start
	while pos < limit and src[pos] != quote do pos += 1
	if pos >= limit then return new BadXMLAttribute(loc, "Unfinished attribute `{name}`")
	var value = src.substring(value_start, pos - value_start)
	pos += 1
	return new XMLStringAttr(loc, name, value, quote)
end
288 end
289
redef class Text
	# Parses the content of `self` as an XML document
	#
	# The result is the `XMLEntity` produced by
	# `XMLProcessor::parse_document`: an `XMLDocument` if successful,
	# or an `XMLError` if not.
	fun to_xml: XMLEntity do
		var processor = new XMLProcessor(to_s)
		return processor.parse_document
	end
end