nitcatalog: adapt to new loader API
[nit.git] / lib / dom / parser.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # XML DOM-parsing facilities
12 module parser
13
14 intrude import parser_base
15 intrude import xml_entities
16
17 # Provides XML parsing facilities
18 class XMLProcessor
19 super StringProcessor
20
# Parses a full XML document
#
# Reads `src` from the current position up to its end. Every entity read
# gets as parent the innermost start tag still open, or the document
# itself when no tag is open.
#
# Returns the `XMLDocument` on success. Returns an `XMLError` as soon as
# a tag cannot be read, an end tag does not match the innermost open
# start tag, or some start tag is still open at the end of `src`.
fun parse_document: XMLEntity do
	# Start tags that are still waiting for their end tag, innermost last
	var stack = new Array[XMLStartTag]
	var doc = new XMLDocument
	loop
		ignore_whitespaces
		if pos >= src.length then break
		if src[pos] == '<' then
			var tag = read_tag
			if tag isa XMLStartTag then
				if stack.is_empty then
					tag.parent = doc
				else
					var st_last = stack.last
					tag.parent = st_last
				end
				stack.push tag
			else if tag isa XMLEndTag then
				if stack.is_empty then
					return new XMLError(tag.location, "Missing matching tag for `{tag.tag_name}`")
				end
				var st_last = stack.last
				if tag.tag_name == st_last.tag_name then
					st_last.matching = tag
					tag.matching = st_last
					stack.pop
				else
					var miss = stack.pop
					return new XMLError(miss.location, "Missing matching tag for `{miss.tag_name}`")
				end
			else if tag isa XMLError then
				return tag
			else
				# Any other entity is a leaf: attach it and keep going
				if stack.is_empty then
					tag.parent = doc
				else
					tag.parent = stack.last
				end
			end
		else
			# Character data. The location is taken before scanning so
			# that it designates the start of the text, like for tags.
			var loc = new Location(line, line_offset)
			var st = pos
			var end_pc = ignore_until("<")
			if end_pc == -1 then
				# No further tag: the text runs up to the end of `src`.
				# `pos` is forced to the end so the main loop stops.
				end_pc = src.length
				pos = end_pc
			end
			var pc = new PCDATA(loc, src.substring(st, end_pc - st).trim)
			if stack.is_empty then
				pc.parent = doc
			else
				pc.parent = stack.last
			end
		end
	end
	if not stack.is_empty then
		var miss = stack.pop
		return new XMLError(miss.location, "Missing matching tag for `{miss.tag_name}`")
	end
	return doc
end
78
# Reads the tag starting in `src` at current position
#
# The character following `<` selects the reader: `!` for special tags,
# `?` for prolog and processing instructions, `/` for end tags, and
# anything else for start tags.
#
# Returns an `XMLError` if the current character is not `<` or if `src`
# ends right after it.
private fun read_tag: XMLEntity do
	var st_loc = new Location(line, line_offset)
	var c = src[pos]
	if not c == '<' then return new XMLError(st_loc, "Expected start of tag, got `{c}`")
	pos += 1
	# A lone `<` ending the input leaves nothing to dispatch on
	if pos >= src.length then return new XMLError(st_loc, "Unexpected EOF after start of tag")
	c = src[pos]
	if c == '!' then
		# Special tag
		return read_special_tag(st_loc)
	else if c == '?' then
		# Prolog tag
		return read_prolog_tag(st_loc)
	else if c == '/' then
		# End tag
		return read_end_tag(st_loc)
	else
		# Start tag
		return read_start_tag(st_loc)
	end
end
100
# Reads a Special tag (starting with <!)
#
# Handles comments (`<!-- ... -->`), CDATA sections (`<![CDATA[ ... ]]>`)
# and DOCTYPE declarations; anything else is read up to the next `>` and
# returned as an `XMLSpecialTag`.
#
# In case of error, returns a `XMLError`
private fun read_special_tag(st_loc: Location): XMLEntity do
	var srclen = src.length
	pos += 1
	if (pos + 2) >= srclen then return new XMLError(st_loc, "Unexpected EOF on start of Special tag")
	if src[pos] == '-' and src[pos + 1] == '-' then
		pos += 2
		var comst = pos
		var endcom = ignore_until("-->")
		if endcom == -1 then return new XMLError(st_loc, "Malformed comment")
		pos += 3
		# `endcom` is the index of the first `-` of the terminator,
		# which is not part of the comment content
		return new XMLCommentTag(st_loc, src.substring(comst, endcom - comst))
	end
	var st = pos
	if srclen - pos >= 7 then
		var spe_type = src.substring(pos, 7)
		if spe_type == "[CDATA[" then
			pos += 7
			var cdst = pos
			var cdend = ignore_until("]]>")
			# Check the terminator was found before stepping over it, so
			# that a block closed exactly at the end of `src` is accepted
			if cdend == -1 then return new XMLError(st_loc, "Unfinished CDATA block")
			pos += 3
			return new CDATA(st_loc, src.substring(cdst, cdend - cdst))
		else if spe_type == "DOCTYPE" then
			pos += 7
			return parse_doctype(st_loc)
		end
	end
	var end_spec = ignore_until(">")
	if end_spec == -1 then return new XMLError(st_loc, "Unfinished Special tag")
	pos += 1
	return new XMLSpecialTag(st_loc, src.substring(st, end_spec - st))
end
135
# Parse a Doctype declaration tag
#
# Collects the whitespace-separated elements found up to the closing `>`
# and returns them, joined with single spaces, in an `XMLDoctypeTag`.
# An internal subset (`[ ... ]`) is kept verbatim as a single element.
#
# Returns an `XMLError` if `src` ends before the closing `>`, or if an
# internal subset is not closed.
private fun parse_doctype(st_loc: Location): XMLEntity do
	var elemts = new Array[String]
	var srclen = src.length
	loop
		ignore_whitespaces
		if pos >= srclen then return new XMLError(st_loc, "Malformed doctype")
		var c = src[pos]
		# TODO: Properly support intern DOCTYPE definitions
		if c == '[' then
			var intern_st = pos
			var intern_end = ignore_until("]")
			if intern_end == -1 then return new XMLError(st_loc, "Unfinished internal doctype declaration")
			pos += 1
			# `+ 1` keeps the closing bracket in the element
			elemts.push src.substring(intern_st, intern_end - intern_st + 1)
			continue
		end
		var elm_st = pos
		while pos < srclen and not src[pos].is_whitespace and src[pos] != '>' do pos += 1
		if pos >= srclen then return new XMLError(st_loc, "Malformed doctype")
		# Keep every non-empty element, one-character names included
		if pos > elm_st then
			var str = src.substring(elm_st, pos - elm_st)
			elemts.push str
		end
		if src[pos] == '>' then
			pos += 1
			return new XMLDoctypeTag(st_loc, "DOCTYPE", elemts.join(" "))
		end
	end
end
166
# Reads a Prolog or Processing Instruction tag (starting with <?)
#
# A target named exactly `xml` is read as a prolog with attributes and
# yields an `XMLPrologTag`. Any other target yields an
# `XMLProcessingInstructionTag` whose content is the raw text up to `?>`.
#
# In case of error, returns a `XMLError`
private fun read_prolog_tag(st_loc: Location): XMLEntity do
	var srclen = src.length
	pos += 1
	if pos >= srclen then return new XMLError(st_loc, "Invalid start of prolog")
	var tag_name = parse_tag_name(['<', '>'])
	# The target may run up to the end of `src`
	if pos >= srclen then return new XMLError(st_loc, "Malformed prolog tag")
	var c = src[pos]
	if c == '<' or c == '>' then return new XMLError(st_loc, "Unexpected character `{c}` in prolog declaration")
	if tag_name == "xml" then
		var args = parse_args(['?'])
		for i in args do
			if i isa BadXMLAttribute then return new XMLError(i.location, i.name)
		end
		# The `?` must be directly followed by `>`, within `src`
		if pos + 1 < srclen and src[pos] == '?' and src[pos + 1] == '>' then
			pos += 2
			return new XMLPrologTag(st_loc, tag_name, args)
		end
	else
		# Only the target `xml` itself, in any letter case, is reserved;
		# targets that merely contain it (e.g. `xml-stylesheet`) are legal
		if tag_name.to_lower == "xml" then return new XMLError(st_loc, "Forbidden keyword xml in Processing Instruction")
		var cont_st = pos
		var cont_end = ignore_until("?>")
		if cont_end == -1 then
			pos += 2
			return new XMLError(st_loc, "Malformed Processing Instruction tag")
		end
		pos += 2
		return new XMLProcessingInstructionTag(st_loc, tag_name, src.substring(cont_st, cont_end - cont_st))
	end
	pos += 1
	return new XMLError(st_loc, "Malformed prolog tag")
end
202
# Reads an End tag (starting with </)
#
# The tag name may be followed by whitespace before the closing `>`.
#
# In case of error, returns a `XMLError`
private fun read_end_tag(st_loc: Location): XMLEntity do
	pos += 1
	# `</` ending the input: there is no name to read
	if pos >= src.length then return new XMLError(st_loc, "Unexpected EOF on end tag")
	var tag_name = parse_tag_name(['<', '>'])
	ignore_whitespaces
	# The bound is checked first: `src` may end before the closing `>`
	if pos < src.length and src[pos] == '>' then
		pos += 1
		return new XMLEndTag(st_loc, tag_name)
	end
	return new XMLError(st_loc, "Bad end tag `{tag_name}`")
end
216
# Reads a Start tag (starting with <)
#
# Returns an `XMLOnelinerTag` when the tag is closed with `/>`, and an
# `XMLStartTag` when it is closed with `>`.
#
# In case of error, returns a `XMLError`
private fun read_start_tag(st_loc: Location): XMLEntity do
	var tag_name = parse_tag_name(['/', '>'])
	var args = parse_args(['/', '>'])
	for i in args do
		if i isa BadXMLAttribute then return new XMLError(i.location, i.name)
	end
	# With no bad attribute, `parse_args` stopped on `/` or `>`,
	# so `pos` is a valid index here
	if src[pos] == '/' then
		if pos + 1 < src.length and src[pos + 1] == '>' then
			pos += 2
			return new XMLOnelinerTag(st_loc, tag_name, args)
		end
		# A `/` that does not close the tag cannot be part of a start tag
		return new XMLError(st_loc, "Malformed tag `{tag_name}`")
	end
	pos += 1
	return new XMLStartTag(st_loc, tag_name, args)
end
235
# Parses an xml tag name
#
# Reads from the current position until a whitespace, one of `delims`,
# or the end of `src` is met. `pos` is left on the character that
# stopped the scan.
#
# Returns the characters read, trimmed. The result is empty when no
# character could be read.
private fun parse_tag_name(delims: Array[Char]): String do
	var idst = pos
	var srclen = src.length
	# The bound is tested before each read so that a name running up to
	# the end of `src` never indexes past it
	while pos < srclen do
		var c = src[pos]
		if c.is_whitespace or delims.has(c) then break
		pos += 1
	end
	if pos == idst then return ""
	return src.substring(idst, pos - idst).trim
end
247
# Parse the arguments of a tag
#
# Attributes are read one by one with `parse_arg` until it reports the
# end of the attribute list. A `BadXMLAttribute` also stops the reading,
# and is kept as the last element of the result.
private fun parse_args(endtags: Array[Char]): Array[XMLAttribute] do
	var collected = new Array[XMLAttribute]
	loop
		var current = parse_arg(endtags)
		# The end marker is not an attribute: leave without storing it
		if current isa XMLAttributeEnd then break
		collected.add current
		if current isa BadXMLAttribute then break
	end
	return collected
end
258
# Parses the next argument in `src`
#
# Returns an `XMLAttributeEnd` when the next non-whitespace character is
# one of `endtags`, an `XMLStringAttr` for a `name = "value"` pair (quoted
# with either `'` or `"`), and a `BadXMLAttribute` carrying an error
# message otherwise.
private fun parse_arg(endtags: Array[Char]): XMLAttribute do
	var srclen = src.length
	ignore_whitespaces
	var st_loc = new Location(line, line_offset)
	if pos >= srclen then return new BadXMLAttribute(st_loc, "Unfinished attribute name")
	# FIXME: Ugly, but as long as it remains private, it is OK I guess
	if endtags.has(src[pos]) then return new XMLAttributeEnd(st_loc, "")

	# Name: everything up to the `=` sign
	var name_from = pos
	loop
		if pos >= srclen then return new BadXMLAttribute(st_loc, "Unfinished attribute name")
		var cur = src[pos]
		if cur == '=' then break
		if endtags.has(cur) then return new BadXMLAttribute(st_loc, "Malformed attribute")
		pos += 1
	end
	var name = src.substring(name_from, pos - name_from).trim

	# Step over `=` and any whitespace before the opening quote
	pos += 1
	ignore_whitespaces
	var quote_at = pos
	if pos >= srclen then return new BadXMLAttribute(st_loc, "Unfinished attribute `{name}`")
	var match = src[pos]
	var quoted = match == '\'' or match == '"'
	if not quoted then return new BadXMLAttribute(st_loc, "Invalid string delimiter `{match}` for attribute `{name}`")

	# Value: everything between the opening quote and the same closing quote
	pos += 1
	loop
		if pos >= srclen then return new BadXMLAttribute(st_loc, "Unfinished attribute `{name}`")
		if src[pos] == match then break
		pos += 1
	end
	var val = src.substring(quote_at + 1, pos - quote_at - 1)
	pos += 1
	return new XMLStringAttr(st_loc, name, val, match)
end
287 end
288
redef class Text
	# Parses `self` as an XML document
	#
	# The result is an `XMLDocument` when parsing succeeds, and an
	# `XMLError` otherwise.
	fun to_xml: XMLEntity do
		var processor = new XMLProcessor(self.to_s)
		return processor.parse_document
	end
end