Merge: Attributes access in nitvm
[nit.git] / lib / saxophonit / saxophonit.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # A SAX 2 parser in Nit.
12 module saxophonit
13
14 import sax
15 intrude import standard::file
16 private import reader_model
17 private import lexer
18
19 # Implementation of the `XMLReader` interface.
20 #
21 # For the moment, only XML 1.0 is (partially) supported.
22 #
23 # The following mandatory features of XML 1.0 are not yet supported:
24 #
25 # * Parsing of entities (files) encoded in UTF-16.
26 # * Encoding handling.
27 # * Entity references resolving (except for built-in references).
28 # * Handling of the options specified in the XML declaration.
29 # * Parsing of a `DOCTYPE` declaration.
30 #
31 # Also note that this XML processor is unable to retrieve a file from an URL
32 # (only local paths are supported).
33 #
34 # Usage example:
35 #
36 # # Retrieve all text nodes.
37 # class TextListener
38 # super ContentHandler
39 # #
40 # private var buf: Buffer = new FlatBuffer
41 # private var sp: Bool = false
42 # #
43 # redef fun characters(str: String) do
44 # if sp then
45 # if buf.length > 0 then buf.append(" ")
46 # sp = false
47 # end
48 # buf.append(str)
49 # end
50 # #
51 # redef fun ignorable_whitespace(str: String) do
52 # sp = true
53 # end
54 # #
55 # # Return the concatenation of all text nodes.
56 # redef fun to_s do return buf.to_s
57 # end
58 # #
59 # var text = new TextListener
60 # var reader = new XophonReader
61 # #
62 # reader.content_handler = text
63 # reader.parse(new InputSource.with_stream(new StringIStream("<foo>bar baz <n>42</n>.</foo>")))
64 # assert text.to_s == "bar baz 42."
65 class XophonReader
66 super XMLReader
67
# State holder this reader delegates to: handlers, features, properties,
# locator and event firing are all reached through it in this class.
68 private var model = new XophonReaderModel
# Lexer of the entity being parsed. Not initialized at construction
# (`noinit`); `parse` assigns it before calling `parse_main`.
69 private var lexer: XophonLexer is noinit
70
# Entity resolver currently stored in the model.
redef fun entity_resolver: nullable EntityResolver do
	return model.entity_resolver
end

# Store `entity_resolver` in the model (`null` is accepted).
redef fun entity_resolver=(entity_resolver: nullable EntityResolver) do model.entity_resolver = entity_resolver
75
# DTD handler currently stored in the model.
redef fun dtd_handler: nullable DTDHandler do
	return model.dtd_handler
end

# Store `dtd_handler` in the model (`null` is accepted).
redef fun dtd_handler=(dtd_handler: nullable DTDHandler) do model.dtd_handler = dtd_handler
80
# Content handler currently stored in the model.
redef fun content_handler: nullable ContentHandler do
	return model.content_handler
end

# Store `content_handler` in the model (`null` is accepted).
redef fun content_handler=(content_handler: nullable ContentHandler) do model.content_handler = content_handler
85
# Error handler currently stored in the model.
redef fun error_handler: nullable ErrorHandler do
	return model.error_handler
end

# Store `error_handler` in the model (`null` is accepted).
redef fun error_handler=(error_handler: nullable ErrorHandler) do model.error_handler = error_handler
90
91
# Forward the "is this feature name recognized?" query to the model.
redef fun feature_recognized(name: String): Bool do return model.feature_recognized(name)
95
# Forward the "can this feature be read?" query to the model.
redef fun feature_readable(name: String): Bool do return model.feature_readable(name)
99
# Forward the "can this feature be changed?" query to the model.
#
# This must ask the model about writability, not readability: a feature
# that can be read is not necessarily one that can be set.
redef fun feature_writable(name: String): Bool do
	return model.feature_writable(name)
end
103
# Current value of the feature `name`, as held by the model.
redef fun feature(name: String): Bool do
	return model.feature(name)
end

# Set the feature `name` to `value` in the model.
redef fun feature=(name: String, value: Bool) do
	model.feature(name) = value
end
106
# Forward the "is this property name recognized?" query to the model.
redef fun property_recognized(name: String): Bool do return model.property_recognized(name)
110
# Forward the "can this property be read?" query to the model.
redef fun property_readable(name: String): Bool do return model.property_readable(name)
114
# Forward the "can this property be changed?" query to the model.
redef fun property_writable(name: String): Bool do return model.property_writable(name)
118
# Current value of the property `name`, as held by the model.
redef fun property(name: String): nullable Object do return model.property(name)

# Set the property `name` to `value` in the model.
redef fun property=(name: String, value: nullable Object) do model.property(name) = value
126
# Parse the document described by `input`.
#
# A fresh locator is installed on the model first. The stream of `input`
# is used when there is one; otherwise the file designated by the resolved
# system identifier is opened, parsed, then closed. When neither a stream
# nor a system identifier is given, a fatal error is fired.
redef fun parse(input: InputSource) do
	var requested_id = input.system_id
	var resolved: nullable MaybeError[String, Error] = null
	model.locator = new SAXLocatorImpl

	if requested_id != null then
		resolved = resolve_system_id(requested_id)
		if resolved.is_error then
			# Only a warning here: the failure becomes fatal below only
			# if there is no stream to read from.
			model.fire_warning(resolved.error.message, resolved.error)
		else
			model.locator.system_id = resolved.value
		end
	end
	model.locator.public_id = input.public_id
	# TODO: encoding

	var stream = input.stream
	if stream != null then
		# The stream is supplied by the caller; it is not closed here.
		lexer = new XophonLexer(model, stream)
		parse_main
	else if resolved != null then
		if resolved.is_error then
			model.fire_fatal_error("File <{requested_id.as(not null)}> not found.", null)
		else
			# The file is opened here, so it is closed here too.
			lexer = new XophonLexer(model,
					new IFStream.open(resolved.value))
			parse_main
			lexer.close
		end
	else
		model.fire_fatal_error("At least a stream or a system identifier must be specified. None given.",
				null)
	end
end
160
# Parse the document designated by `system_id` alone (no stream).
redef fun parse_file(system_id: String) do
	var source = new InputSource.with_system_id(system_id)
	parse(source)
end
164
165
166 ############################################################################
167 # Parsing
168
169 # Note: Except for `parse_main`, no `expect_*` function calls `read_char`
170 # for its first byte, and each one leaves the byte just after its production
171 # in `last_char` (except in case of fatal error). They return `false` on
172 # fatal error and at the end of the file.
173
174 # Parse the main entity.
private fun parse_main do
	# Events announcing the document come first...
	model.fire_document_locator
	model.fire_start_document

	# ...then the lexer is started and the whole document is consumed.
	# The result of `expect_document` is deliberately not used here:
	# the end-of-document event is fired whatever the outcome.
	lexer.start
	expect_document
	model.fire_end_document
end
182
183 # Expect a `document` production.
private fun expect_document: Bool do
	# Set once a `DOCTYPE` declaration has been accepted: a second one is
	# then rejected.
	var got_doctype = false

	# If the document starts with `<`, it may start with an XML declaration,
	# a processing instruction, a comment, a `DOCTYPE` declaration or the
	# root element. Otherwise, it must start with a white space.
	if lexer.accept('<') then
		if lexer.accept('?') then
			# Only at the very beginning may `<?` introduce `<?xml ...?>`.
			if not expect_pi_or_xml_decl then return false
		else if lexer.accept('!') then
			if lexer.accept('-') then
				if not lexer.expect('-', " at the beginning of a comment") then return false
				if not expect_comment then return false
			else
				if not expect_doctype_decl then return false
				got_doctype = true
			end
		else
			# Root element right away: only `Misc*` may follow it.
			return expect_root and expect_miscs
		end
	else if not lexer.accept_s then
		return lexer.fire_unexpected_char(
				". Expecting a white space or `<`")
	end

	# After the XML declaration (if there is one), the document may contain
	# processing instructions, comments, the `DOCTYPE` declaration and
	# the root element, possibly separated by white space.
	# The loop ends by returning: either on error, or once the root element
	# has been parsed.
	loop
		if lexer.accept('<') then
			if lexer.accept('?') then
				if not expect_pi then return false
			else if lexer.accept('!') then
				if lexer.accept('-') then
					if not lexer.expect('-', " at the beginning of a comment") then return false
					if not expect_comment then return false
				else if got_doctype then
					# A second `DOCTYPE` is not allowed, so only a
					# comment could start with `<!` here.
					return lexer.fire_unexpected_char(". Expecting `-`")
				else if expect_doctype_decl then
					got_doctype = true
				else
					return false
				end
			else
				# The `DOCTYPE` declaration *must* come before the root
				# element, so nothing but `Misc*` is accepted after it.
				return expect_root and expect_miscs
			end
		else if not lexer.accept_s then
			return lexer.fire_unexpected_char(
					". Expecting a white space or `<`")
		end
	end
end
254
# Placeholder for the `doctypedecl` production: always fires a fatal error
# and returns what `fire_fatal_error` returns.
private fun expect_doctype_decl: Bool do
	# TODO: parse the declaration instead of rejecting it.
	var message = "DTD not supported yet.\n"
	return model.fire_fatal_error(message, null)
end
258
259 # Expect the root `element` production, without the first `<` token.
private fun expect_root: Bool do
	# Character data accumulated between two markup constructs.
	var text = new FlatBuffer
	var ok = expect_stag

	# Consume the content chunk by chunk until an error, the end of the
	# input, or the closing of the root element.
	while ok and not lexer.eof and not model.root_closed do
		ok = expect_content_chunk(text)
	end
	if ok then ok = model.expect_root_closed

	# Pending character data is emitted even on failure.
	flush(text)
	return ok
end
274
275 # Parse a `EmptyElemTag | STag | ETag | Reference | CDSect | PI | Comment | CharData` production.
276 #
277 # If the last read byte matches the `CharData` production, push the char in
278 # `char_data`. Else, flush `CharData` as a `characters` event.
private fun expect_content_chunk(char_data: Buffer): Bool do
	if lexer.accept('<') then
		# Markup starts: emit the text gathered so far.
		flush(char_data)
		if lexer.accept('!') then
			if lexer.accept('-') then
				if not lexer.expect('-', " at the beginning of a comment") then return false
				return expect_comment
			end
			if lexer.accept('[') then return expect_cd_sect
			return lexer.fire_unexpected_char(
					". Expecting `--` or `[CDATA[`")
		end
		if lexer.accept('?') then return expect_pi
		if lexer.accept('/') then return expect_etag
		return expect_stag
	end
	if lexer.accept('&') then
		# The replacement text of a reference is emitted on its own:
		# flush before and after resolving it.
		flush(char_data)
		var resolved = expect_reference(char_data)
		flush(char_data)
		return resolved
	end
	# Anything else is plain character data.
	return lexer.expect_xml_char(char_data)
end
309
310 # Expect a `EmptyElemTag | STag` production, without the initial `<`.
private fun expect_stag: Bool do
	var raw_name = new FlatBuffer
	if not lexer.expect_name(raw_name) then return false
	var name = raw_name.to_s

	model.fire_start_attributes
	# Each iteration handles the end of the tag or one attribute.
	# The loop is only left by returning.
	loop
		# `>` right after the name or after an attribute value.
		if lexer.accept('>') then
			model.fire_start_element(name)
			return true
		end
		# `/>`: empty element, reported as a start immediately followed
		# by an end.
		if lexer.accept('/') then
			if not lexer.expect('>', "") then return false
			model.fire_start_element(name)
			model.fire_end_element(name)
			return true
		end
		# Otherwise, white space is mandatory before what comes next.
		if not lexer.expect_s then
			return lexer.fire_unexpected_char(" in tag. " +
					"Expecting an attribute, `/`, `>` or white space")
		end
		if lexer.accept('/') then
			if not lexer.expect('>', "") then return false
			model.fire_start_element(name)
			model.fire_end_element(name)
			return true
		end
		if lexer.accept('>') then
			model.fire_start_element(name)
			return true
		end
		if not expect_attribute then return false
	end
end
353
354 # Expect a `ETag` production, without the initial `</`.
private fun expect_etag: Bool do
	var name = new FlatBuffer

	# Name, optional white space, then `>`; stop at the first failure.
	if not lexer.expect_name(name) then return false
	if not lexer.skip_s then return false
	if not lexer.expect('>', "") then return false
	return model.fire_end_element(name.to_s)
end
366
367 # Expect an `Attribute` production.
private fun expect_attribute: Bool do
	var attr_name = new FlatBuffer
	var attr_value = new FlatBuffer

	# `Name Eq AttValue`, evaluated left to right, stopping at the first
	# failure.
	var parsed = lexer.expect_name(attr_name) and
			lexer.expect_eq and
			expect_att_value(attr_value)

	# The attribute is reported only when it was entirely parsed.
	if parsed then model.fire_attribute(attr_name.to_s, attr_value.to_s)
	return parsed
end
381
382 # Expect the `Misc*` production at the end of a document.
private fun expect_miscs: Bool do
	while not lexer.eof do
		if not lexer.accept('<') then
			# Outside markup, only white space is allowed.
			if lexer.accept_s then continue
			return lexer.fire_unexpected_char(
					". Expecting a white space or `<`")
		end
		if lexer.accept('?') then
			if not expect_pi then return false
		else if lexer.accept('!') then
			# After the root element, `<!` can only start a comment.
			if not lexer.expect_string("--", " at the beginning of a comment") then return false
			if not expect_comment then return false
		else
			return lexer.fire_unexpected_char(". Expecting `?` or `!`")
		end
	end
	return true
end
404
405 # Expect a `AttValue` production.
406 #
407 # Append the parsed value to `buffer`.
private fun expect_att_value(buffer: Buffer): Bool do
	# Opening quote; a negative value signals that none was found.
	var quote = lexer.expect_delimiter
	if quote < 0 then return false

	# Read until the matching closing quote. The loop is only left by
	# returning.
	loop
		if lexer.accept_int(quote) then return true
		if lexer.accept('&') then
			# TODO: [WFC: No < in Attribute Values]
			if not expect_reference(buffer) then return false
		else if not lexer.expect_att_value_char(buffer) then
			return false
		end
	end
end
423
424 # Expect a `SystemLiteral` production.
425 #
426 # Also used to parse productions that do not have references.
427 # Append the parsed value to `buffer`.
private fun expect_literal(buffer: Buffer): Bool do
	# Opening quote; a negative value signals that none was found.
	var quote = lexer.expect_delimiter
	if quote < 0 then return false

	# Copy characters verbatim until the matching closing quote.
	loop
		if lexer.accept_int(quote) then return true
		if not lexer.expect_xml_char(buffer) then return false
	end
end
440
441
442 # Expect a `Comment` production, without the beginning.
443 #
444 # Assume `last_char` is the fifth byte of the production that is, the
445 # next byte after the `'<!--'` token.
private fun expect_comment: Bool do
	var text: Buffer = new FlatBuffer

	loop
		if lexer.accept('-') then
			if lexer.accept('-') then
				# `--` is only accepted as the start of the closing `-->`.
				if not lexer.expect('>',
						" after a double-hyphen (`--`) in a comment") then
					return false
				end
				model.fire_comment(text.to_s)
				return true
			end
			# Lone hyphen: part of the comment text, like the character
			# read just below.
			text.chars.push('-')
		end
		if not lexer.expect_xml_char(text) then return false
	end
end
469
470 # Expect a `PI` production, without the beginning.
471 #
472 # Assume `last_char` is the third byte of the production that is, the
473 # next byte after the `'<?'` token.
private fun expect_pi: Bool do
	var target = new FlatBuffer

	# The data part is only parsed when the target was accepted.
	if not lexer.expect_pi_target(target) then return false
	return expect_pi_data(target.to_s)
end
480
481 # Expect the data part and the `'?>'` token of a `PI` production.
private fun expect_pi_data(target: String): Bool do
	# `?>` right after the target: instruction without data.
	if lexer.accept('?') then
		if not lexer.expect('>', " at the end of a processing instruction") then return false
		model.fire_processing_instruction(target, null)
		return true
	end

	# Otherwise, white space must separate the target from the data.
	if not lexer.accept_s then
		return lexer.fire_unexpected_char(" after a processing " +
				"instruction target. Expecting a white space or `?>`")
	end

	var data: Buffer = new FlatBuffer
	loop
		if lexer.accept('?') then
			if lexer.accept('>') then
				model.fire_processing_instruction(target, data.to_s)
				return true
			end
			# `?` not followed by `>`: part of the data, like the
			# character read just below.
			data.chars.push('?')
		end
		if not lexer.expect_xml_char(data) then return false
	end
end
512
513 # Expect a `PI | XMLDecl` production, without the beginning.
514 #
515 # Assume `last_char` is the third byte of the production that is, the
516 # next byte after the `'<?'` token.
private fun expect_pi_or_xml_decl: Bool do
	var buffer: Buffer = new FlatBuffer

	if not lexer.expect_name(buffer) then return false
	var target = buffer.to_s

	# The exact name `xml` introduces the XML declaration; any other name
	# must first be validated as a processing instruction target.
	if target == "xml" then return expect_xml_decl
	if not lexer.check_pi_target(target) then return false
	return expect_pi_data(target)
end
534
535 # Expect a `XMLDecl` production, without the initial `<?xml` token.
private fun expect_xml_decl: Bool do
	if not expect_version_info then return false

	# Without white space after the version, only `?>` may follow.
	if not lexer.accept_s then return lexer.expect_string("?>", "")

	# Optional encoding declaration, recognized by its first letter.
	if lexer.is_char('e') then
		if not expect_encoding_decl then return false
		# At this point, we can only accept `S` or `'?>'`.
		if not lexer.accept_s then return lexer.expect_string("?>", "")
	end

	# Optional standalone declaration, recognized by its first letter.
	if lexer.is_char('s') then
		if not expect_sd_decl then return false
	end

	if not lexer.skip_s then return false
	return lexer.expect_string("?>", "")
end
552
553 # Expect a `EncodingDecl` token, without the initial `S` token.
private fun expect_encoding_decl: Bool do
	var encoding = new FlatBuffer

	# `encoding`, `=`, then the quoted name; stop at the first failure.
	if not lexer.expect_string("encoding", "") then return false
	if not lexer.expect_eq then return false
	if not expect_literal(encoding) then return false

	# The name is only validated for now.
	# TODO: Do something with the value.
	if encoding.has("^[A-Za-z][A-Za-z0-9._-]*$".to_re) then return true
	return model.fire_fatal_error("`{encoding.to_s}` is not a valid " +
			"encoding name.", null)
end
568
569 # Expect a `SDDecl` token, without the initial `S` token.
private fun expect_sd_decl: Bool do
	var buf = new FlatBuffer

	# `standalone`, `=`, then the quoted value; stop at the first failure.
	if not lexer.expect_string("standalone", "") then return false
	if not lexer.expect_eq then return false
	if not expect_literal(buf) then return false

	# The value is only validated for now.
	# TODO: Do something with the value.
	var value = buf.to_s
	if value == "yes" or value == "no" then return true
	return model.fire_fatal_error("`{value}` is not a valid value for " +
			"the `standalone` declaration. Expecting `yes` or `no`.",
			null)
end
587
588 # Expect a `CDSect` production, without the beginning.
589 #
590 # Assume `last_char` is the fourth byte of the production that is, the
591 # next byte after the `'<!['` token.
private fun expect_cd_sect: Bool do
	var buffer: Buffer = new FlatBuffer

	# Number of consecutive closing brackets read but not yet known to be
	# data or part of the closing `]]>`.
	var closing = 0

	if not lexer.expect_string("CDATA[",
			" at the beginning of a CDATA section.") then
		return false
	end
	model.fire_start_cdata
	loop
		if lexer.accept(']') then
			closing += 1
		else if closing >= 2 and lexer.accept('>') then
			# End of the section. The last two brackets belong to the
			# `]]>` terminator; any bracket before them is data
			# (e.g. `]]]>` ends a section whose last character is `]`).
			var extra = closing - 2
			for i in [0..extra[ do
				buffer.chars.push(']')
			end
			break
		else
			# The pending brackets were not a terminator: they are data.
			# The counter is reset only now, after the terminator test
			# above had a chance to use it.
			for i in [0..closing[ do
				buffer.chars.push(']')
			end
			closing = 0
			if not lexer.expect_xml_char(buffer) then return false
		end
	end
	flush(buffer)
	model.fire_end_cdata
	return true
end
620
621 # Expect a `VersionInfo` production.
private fun expect_version_info: Bool do
	# White space, `version`, then `=`; stop at the first failure.
	if not lexer.expect_s then return false
	if not lexer.expect_string("version",
			" in the first attribute name of the XML declaration") then
		return false
	end
	if not lexer.expect_eq then return false

	# Opening quote; a negative value signals that none was found.
	var delimiter = lexer.expect_delimiter
	if delimiter < 0 then return false

	# `1.`, the minor version digits, then the matching closing quote.
	var minor: Buffer = new FlatBuffer
	if not lexer.expect_string("1.", " as XML major version") then return false
	if not lexer.expect_digits(minor) then return false
	if not lexer.expect_int(delimiter, "") then return false

	# Any minor version other than 0 is accepted, with a warning.
	var minor_version = minor.to_s
	if minor_version != "0" then
		model.fire_warning("Only XML 1.0 is supported. " +
				"Got a XML 1.{minor_version} document.", null)
	end
	return true
end
645
646 # Expect a `Reference`, without the initial `&`.
647 #
648 # Append the value to the buffer.
private fun expect_reference(buffer: Buffer): Bool do
	# TODO: [WFC: Entity Declared]
	# TODO: [VC: Entity Declared]
	# TODO: [WFC: Parsed Entity]
	# TODO: [WFC: No Recursion]
	# TODO: Unicode

	var ref = new FlatBuffer

	# Character reference: `&#x` + hexadecimal digits, or `&#` + decimal
	# digits. The character is appended before the final `;` is checked.
	if lexer.accept('#') then
		if lexer.accept('x') then
			if not lexer.expect_hex(ref) then
				return lexer.fire_unexpected_char(
						". Expecting an hexadecimal digit")
			end
			buffer.chars.add(ref.to_hex.ascii)
			return lexer.expect(';', "")
		end
		if not lexer.accept_digits(ref) then
			return lexer.fire_unexpected_char(" in a character reference. " +
					"Expecting `x` or a decimal digit")
		end
		buffer.chars.add(ref.to_i.ascii)
		return lexer.expect(';', "")
	end

	# Entity reference: a name to resolve.
	if not lexer.expect_name(ref) then
		return lexer.fire_unexpected_char(
				" in a reference. Expecting `#` or a name")
	end
	var name = ref.to_s
	# A colon in the name is reported as an error, but parsing goes on.
	if name.has(":") then
		model.fire_error("The entity name `{name}` contains a colon.", null)
	end

	var value = resolve_reference(name)
	if value == null then
		model.fire_fatal_error("Unknown entity `{name}`.", null)
		return false
	end
	buffer.append(value)
	return lexer.expect(';', "")
end
693
694 # Resolve the entity reference or return `null`.
private fun resolve_reference(name: String): nullable String do
	# Only these five built-in names are known.
	# TODO: Support non-builtin entities
	if name == "lt" then return "<"
	if name == "gt" then return ">"
	if name == "amp" then return "&"
	if name == "quot" then return "\""
	if name == "apos" then return "'"
	return null
end
711
712 # Flush the specified buffer as a `characters` event.
713 #
714 # Do nothing if `buffer` is empty.
private fun flush(buffer: Buffer) do
	# Nothing to report: no event is fired for an empty buffer.
	if buffer.length == 0 then return

	model.fire_characters(buffer.to_s)
	# Empty the buffer so that the same text is not reported twice.
	buffer.clear
end
721
722
723 ############################################################################
724 # Paths
725
726 # Resolve the specified system id.
private fun resolve_system_id(system_id: String): MaybeError[String, Error] do
	# For now, the identifier is always treated as a local path.
	# TODO: handle URIs
	var resolved = realpath(system_id)
	return resolved
end
731
732 # Resolve the specified POSIX path.
733 #
734 # Like `String.realpath`, but with error handling.
private fun realpath(path: String): MaybeError[String, Error] do
	var cs = path.to_cstring.file_realpath

	# A non-null address is wrapped as the successful value.
	if not cs.address_is_null then
		return new MaybeError[String, Error](cs.to_s, null)
	end
	# A null address is reported as "not found", carried as an error.
	return new MaybeError[String, Error](null,
			new Error("File <{path}> not found."))
end
745 end