Merge: doc: fixed some typos and other misc. corrections
[nit.git] / lib / dom / parser.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it if you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or as a part of
9 # another product.
10
11 # XML DOM-parsing facilities
12 module parser
13
14 intrude import parser_base
15 intrude import xml_entities
16
17 # Provides XML parsing facilities
18 class XMLProcessor
19 super StringProcessor
20
# Parses a full XML document
#
# Scans `src` from the current position up to its end. Every entity read is
# attached to the innermost start tag still open, or to the document itself
# when no tag is open.
#
# Returns the `XMLDocument` on success. Returns an `XMLError` when a tag is
# malformed, when an end tag does not match the innermost open start tag, or
# when a start tag is still open once the end of `src` is reached.
fun parse_document: XMLEntity do
	# Start tags still waiting for their end tag, innermost last
	var stack = new Array[XMLStartTag]
	var doc = new XMLDocument
	loop
		ignore_whitespaces
		if pos >= src.length then break
		if src[pos] == '<' then
			var tag = read_tag
			if tag isa XMLStartTag then
				if stack.is_empty then
					tag.parent = doc
				else
					tag.parent = stack.last
				end
				stack.push tag
			else if tag isa XMLEndTag then
				if stack.is_empty then
					return new XMLError(tag.location, "Missing matching tag for `{tag.tag_name}`")
				end
				var st_last = stack.last
				if tag.tag_name == st_last.tag_name then
					st_last.matching = tag
					tag.matching = st_last
					stack.pop
				else
					# The innermost open tag is the one left without an end tag
					var miss = stack.pop
					return new XMLError(miss.location, "Missing matching tag for `{miss.tag_name}`")
				end
			else if tag isa XMLError then
				return tag
			else
				# Any other entity returned by `read_tag` is a leaf of the tree
				if stack.is_empty then
					tag.parent = doc
				else
					tag.parent = stack.last
				end
			end
		else
			# Take the location before scanning, so that it designates the
			# start of the text and not the `<` that follows it.
			var loc = new Location(line, line_offset)
			var st = pos
			var text_end = ignore_until("<")
			# `ignore_until` yields -1 when no `<` follows: the text then
			# runs up to the end of `src` and must not be dropped.
			if text_end == -1 then
				text_end = src.length
				pos = text_end
			end
			var pc = new PCDATA(loc, src.substring(st, text_end - st).trim)
			if stack.is_empty then
				pc.parent = doc
			else
				pc.parent = stack.last
			end
		end
	end
	if not stack.is_empty then
		var miss = stack.pop
		return new XMLError(miss.location, "Missing matching tag for `{miss.tag_name}`")
	end
	return doc
end
78
# Reads the tag that begins at the current position of `src`
#
# The character following the opening `<` selects the specialised reader:
# `!` for special tags, `?` for prolog tags, `/` for end tags and anything
# else for start tags.
#
# In case of error, returns a `XMLError`
private fun read_tag: XMLEntity do
	var st_loc = new Location(line, line_offset)
	var first = src[pos]
	if first != '<' then return new XMLError(st_loc, "Expected start of tag, got `{first}`")
	pos += 1
	if pos >= src.length then return new XMLError(st_loc, "Malformed tag")
	var kind = src[pos]
	# Special tag
	if kind == '!' then return read_special_tag(st_loc)
	# Prolog tag
	if kind == '?' then return read_prolog_tag(st_loc)
	# End tag
	if kind == '/' then return read_end_tag(st_loc)
	# Start tag
	return read_start_tag(st_loc)
end
101
# Reads a Special tag (starting with `<!`)
#
# Recognises comments (`<!-- ... -->`), CDATA blocks (`<![CDATA[ ... ]]>`)
# and DOCTYPE declarations; any other `<! ... >` construct is returned as a
# `XMLSpecialTag` holding the raw text found before the closing `>`.
#
# `st_loc` is the location of the opening `<`.
#
# In case of error, returns a `XMLError`
private fun read_special_tag(st_loc: Location): XMLEntity do
	var srclen = src.length
	# Skip the `!`
	pos += 1
	if (pos + 2) >= srclen then return new XMLError(st_loc, "Unexpected EOF on start of Special tag")
	if src[pos] == '-' and src[pos + 1] == '-' then
		pos += 2
		var comst = pos
		var endcom = ignore_until("-->")
		if endcom == -1 then return new XMLError(st_loc, "Malformed comment")
		pos += 3
		# NOTE(review): this length keeps the first `-` of the closing `-->`
		# in the comment text, unlike the CDATA branch below. Left untouched
		# because `XMLCommentTag` (declared elsewhere) may rely on it; confirm
		# before changing.
		return new XMLCommentTag(st_loc ,src.substring(comst, endcom - comst + 1))
	end
	var st = pos
	if srclen - pos >= 7 then
		var spe_type = src.substring(pos, 7)
		if spe_type == "[CDATA[" then
			pos += 7
			var cdst = pos
			var cdend = ignore_until("]]>")
			# Test the result of the search rather than the position after
			# the terminator, so that a block closing exactly at the end of
			# `src` is not reported as unfinished.
			if cdend == -1 then return new XMLError(st_loc, "Unfinished CDATA block")
			pos += 3
			return new CDATA(st_loc, src.substring(cdst, cdend - cdst))
		else if spe_type == "DOCTYPE" then
			pos += 7
			return parse_doctype(st_loc)
		end
	end
	var end_spec = ignore_until(">")
	# Without a closing `>` there is no tag to build
	if end_spec == -1 then return new XMLError(st_loc, "Malformed special tag")
	pos += 1
	return new XMLSpecialTag(st_loc, src.substring(st, end_spec - st))
end
136
# Parses a Doctype declaration tag
#
# Collects the whitespace-separated elements found up to the closing `>`
# and returns them, joined with single spaces, in a `XMLDoctypeTag`.
# A bracketed internal subset (`[ ... ]`) is kept verbatim as one element.
#
# `st_loc` is the location of the opening `<`.
#
# In case of error, returns a `XMLError`
private fun parse_doctype(st_loc: Location): XMLEntity do
	var elemts = new Array[String]
	var srclen = src.length
	loop
		ignore_whitespaces
		if pos >= srclen then return new XMLError(st_loc, "Malformed doctype")
		var c = src[pos]
		# TODO: Properly support internal DOCTYPE definitions
		if c == '[' then
			var intern_st = pos
			var intern_end = ignore_until("]")
			if intern_end == -1 then return new XMLError(st_loc, "Unfinished internal doctype declaration")
			pos += 1
			# Both brackets are part of the stored element
			elemts.push src.substring(intern_st, intern_end - intern_st + 1)
			continue
		end
		var elm_st = pos
		while pos < srclen and not src[pos].is_whitespace and src[pos] != '>' do pos += 1
		if pos >= srclen then return new XMLError(st_loc, "Malformed doctype")
		# Keep every non-empty element, including single-character ones
		if pos - elm_st > 0 then
			var str = src.substring(elm_st, pos - elm_st)
			elemts.push str
		end
		if src[pos] == '>' then
			pos += 1
			return new XMLDoctypeTag(st_loc, "DOCTYPE", elemts.join(" "))
		end
	end
end
167
# Reads a Prolog or Processing Instruction tag (starting with `<?`)
#
# A tag named `xml` is read as a prolog: its attributes are parsed and a
# `XMLPrologTag` is returned. Any other name is read as a processing
# instruction whose content is the raw text found before the closing `?>`.
#
# `st_loc` is the location of the opening `<`.
#
# In case of error, returns a `XMLError`
private fun read_prolog_tag(st_loc: Location): XMLEntity do
	var srclen = src.length
	# Skip the `?`
	pos += 1
	if pos >= srclen then return new XMLError(st_loc, "Invalid start of prolog")
	var tag_name = parse_tag_name(['<', '>'])
	# The name may run up to the end of `src`: nothing is left to read then
	if pos >= srclen then return new XMLError(st_loc, "Malformed prolog tag")
	var c = src[pos]
	if c == '<' or c == '>' then return new XMLError(st_loc ,"Unexpected character `{c}` in prolog declaration")
	if tag_name == "xml" then
		var args = parse_args(['?'])
		for i in args do
			if i isa BadXMLAttribute then return new XMLError(i.location, i.name)
		end
		# Check the bounds before looking at the character that follows `?`
		if pos + 1 < srclen and src[pos] == '?' and src[pos + 1] == '>' then
			pos += 2
			return new XMLPrologTag(st_loc, tag_name, args)
		end
	else
		var cont_st = pos
		var cont_end = ignore_until("?>")
		if cont_end == -1 then
			pos += 2
			return new XMLError(st_loc, "Malformed Processing Instruction tag")
		end
		pos += 2
		return new XMLProcessingInstructionTag(st_loc, tag_name, src.substring(cont_st, cont_end - cont_st))
	end
	pos += 1
	return new XMLError(st_loc, "Malformed prolog tag")
end
202
# Reads an End tag (starting with `</`)
#
# `st_loc` is the location of the opening `<`.
# Whitespace is accepted between the tag name and the closing `>`.
#
# In case of error, returns a `XMLError`
private fun read_end_tag(st_loc: Location): XMLEntity do
	# Skip the `/`
	pos += 1
	var tag_name = parse_tag_name(['<', '>'])
	ignore_whitespaces
	# `src` may end before the closing `>`: check the bounds before reading
	if pos < src.length and src[pos] == '>' then
		pos += 1
		return new XMLEndTag(st_loc, tag_name)
	end
	return new XMLError(st_loc, "Bad end tag `{tag_name}`")
end
216
# Reads a Start tag (starting with `<`)
#
# Returns a `XMLOnelinerTag` when the tag is closed by `/>`, and a
# `XMLStartTag` when it is closed by `>`.
#
# `st_loc` is the location of the opening `<`.
#
# In case of error, returns a `XMLError`
private fun read_start_tag(st_loc: Location): XMLEntity do
	var tag_name = parse_tag_name(['/', '>'])
	var args = parse_args(['/', '>'])
	for i in args do
		if i isa BadXMLAttribute then return new XMLError(i.location, i.name)
	end
	# Without a bad attribute, `parse_args` stopped on `/` or on `>`
	if src[pos] == '/' then
		# Check the bounds before looking at the character that follows `/`
		if pos + 1 < src.length and src[pos + 1] == '>' then
			pos += 2
			return new XMLOnelinerTag(st_loc, tag_name, args)
		end
		# A `/` that is not part of `/>` cannot end a start tag
		return new XMLError(st_loc, "Malformed tag `{tag_name}`")
	end
	pos += 1
	return new XMLStartTag(st_loc, tag_name, args)
end
235
# Parses an XML tag name
#
# Advances `pos` until a whitespace, one of `delims` or the end of `src` is
# met, then returns the trimmed text that was skipped.
private fun parse_tag_name(delims: Array[Char]): String do
	var start = pos
	var limit = src.length
	loop
		if pos >= limit then break
		var cur = src[pos]
		if delims.has(cur) or cur.is_whitespace then break
		pos += 1
	end
	var name = src.substring(start, pos - start)
	return name.trim
end
247
# Parses the attributes of a tag
#
# Attributes are read one after the other until one of `endtags` is met.
# Reading also stops at the first `BadXMLAttribute`, which is then the last
# element of the returned array.
private fun parse_args(endtags: Array[Char]): Array[XMLAttribute] do
	var attrs = new Array[XMLAttribute]
	var arg = parse_arg(endtags)
	while not (arg isa XMLAttributeEnd) do
		attrs.add arg
		if arg isa BadXMLAttribute then break
		arg = parse_arg(endtags)
	end
	return attrs
end
258
# Parses the next attribute in `src`
#
# Returns:
#
# * a `XMLAttributeEnd` when the next character is one of `endtags`;
# * a `XMLStringAttr` for a well-formed `name="value"` or `name='value'`;
# * a `BadXMLAttribute`, whose name is the error message, otherwise.
private fun parse_arg(endtags: Array[Char]): XMLAttribute do
	var limit = src.length
	ignore_whitespaces
	var st_loc = new Location(line, line_offset)
	if pos >= limit then return new BadXMLAttribute(st_loc, "Unfinished attribute name")
	# FIXME: Ugly, but as long as it remains private, it is OK I guess
	if endtags.has(src[pos]) then return new XMLAttributeEnd(st_loc, "")

	# Name: everything up to the `=` sign
	var name_start = pos
	loop
		if pos >= limit then return new BadXMLAttribute(st_loc, "Unfinished attribute name")
		var cur = src[pos]
		if cur == '=' then break
		if endtags.has(cur) then return new BadXMLAttribute(st_loc, "Malformed attribute")
		pos += 1
	end
	var name = src.substring(name_start, pos - name_start).trim

	# Skip the `=` and any whitespace before the opening delimiter
	pos += 1
	ignore_whitespaces
	var quote_pos = pos
	if pos >= limit then return new BadXMLAttribute(st_loc, "Unfinished attribute `{name}`")
	var match = src[pos]
	if not (match == '\'' or match == '"') then return new BadXMLAttribute(st_loc, "Invalid string delimiter `{match}` for attribute `{name}`")

	# Value: everything between the two matching delimiters
	pos += 1
	while pos < limit and src[pos] != match do pos += 1
	if pos >= limit then return new BadXMLAttribute(st_loc, "Unfinished attribute `{name}`")
	var val = src.substring(quote_pos + 1, pos - quote_pos - 1)
	pos += 1
	return new XMLStringAttr(st_loc, name, val, match)
end
287 end
288
redef class Text
	# Parses `self` as an XML document
	#
	# The result is an `XMLDocument` when parsing succeeds,
	# and an `XMLError` otherwise.
	fun to_xml: XMLEntity do
		var processor = new XMLProcessor(self.to_s)
		return processor.parse_document
	end
end