lib/standard/stream: Renamed streams for more explicit denomination
[nit.git] / lib / saxophonit / saxophonit.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # A SAX 2 parser in Nit.
12 module saxophonit
13
14 import sax
15 intrude import standard::file
16 private import reader_model
17 private import lexer
18
19 # Implementation of the `XMLReader` interface.
20 #
21 # For the moment, only XML 1.0 is (partially) supported.
22 #
23 # The following mandatory features of XML 1.0 are not yet supported:
24 #
25 # * Parsing of entities (files) encoded in UTF-16.
26 # * Encoding handling.
27 # * Entity references resolving (except for built-in references).
28 # * Handling of the options specified in the XML declaration.
29 # * Parsing of a `DOCTYPE` declaration.
30 #
31 # Also note that this XML processor is unable to retrieve a file from a URL
32 # (only local paths are supported).
33 #
34 # Usage example:
35 #
36 # # Retrieve all text nodes.
37 # class TextListener
38 # super ContentHandler
39 # #
40 # private var buf: Buffer = new FlatBuffer
41 # private var sp: Bool = false
42 # #
43 # redef fun characters(str: String) do
44 # if sp then
45 # if buf.length > 0 then buf.append(" ")
46 # sp = false
47 # end
48 # buf.append(str)
49 # end
50 # #
51 # redef fun ignorable_whitespace(str: String) do
52 # sp = true
53 # end
54 # #
55 # # Return the concatenation of all text nodes.
56 # redef fun to_s do return buf.to_s
57 # end
58 # #
59 # var text = new TextListener
60 # var reader = new XophonReader
61 # #
62 # reader.content_handler = text
63 # reader.parse(new InputSource.with_stream(new StringReader("<foo>bar baz <n>42</n>.</foo>")))
64 # assert text.to_s == "bar baz 42."
65 class XophonReader
66 super XMLReader
67
68 private var model = new XophonReaderModel
69 private var lexer: XophonLexer is noinit
70
redef fun entity_resolver: nullable EntityResolver do
	# The handler is kept in `model`, not in the reader itself.
	return model.entity_resolver
end

redef fun entity_resolver=(entity_resolver: nullable EntityResolver) do model.entity_resolver = entity_resolver
75
redef fun dtd_handler: nullable DTDHandler do
	# The handler is kept in `model`, not in the reader itself.
	return model.dtd_handler
end

redef fun dtd_handler=(dtd_handler: nullable DTDHandler) do model.dtd_handler = dtd_handler
80
redef fun content_handler: nullable ContentHandler do
	# The handler is kept in `model`, not in the reader itself.
	return model.content_handler
end

redef fun content_handler=(content_handler: nullable ContentHandler) do model.content_handler = content_handler
85
redef fun error_handler: nullable ErrorHandler do
	# The handler is kept in `model`, not in the reader itself.
	return model.error_handler
end

redef fun error_handler=(error_handler: nullable ErrorHandler) do model.error_handler = error_handler
90
91
# Delegates the query to `model`.
redef fun feature_recognized(name: String): Bool do return model.feature_recognized(name)
95
# Delegates the query to `model`.
redef fun feature_readable(name: String): Bool do return model.feature_readable(name)
99
redef fun feature_writable(name: String): Bool do
	# Ask the model whether the flag may be *modified*. The previous code
	# delegated to `feature_readable`, which answers a different question.
	# NOTE(review): relies on `XophonReaderModel.feature_writable`, the
	# counterpart of the `property_writable` query used below -- confirm.
	return model.feature_writable(name)
end
103
redef fun feature(name: String): Bool do
	# Feature flags live in `model`.
	return model.feature(name)
end

redef fun feature=(name: String, value: Bool) do
	model.feature(name) = value
end
106
# Delegates the query to `model`.
redef fun property_recognized(name: String): Bool do return model.property_recognized(name)
110
# Delegates the query to `model`.
redef fun property_readable(name: String): Bool do return model.property_readable(name)
114
# Delegates the query to `model`.
redef fun property_writable(name: String): Bool do return model.property_writable(name)
118
# Property values live in `model`.
redef fun property(name: String): nullable Object do return model.property(name)

redef fun property=(name: String, value: nullable Object) do model.property(name) = value
126
redef fun parse(input: InputSource) do
	# Outcome of the system identifier resolution. Stays `null` when `input`
	# carries no system identifier.
	var resolved: nullable MaybeError[String, Error] = null

	model.locator = new SAXLocatorImpl
	var raw_id = input.system_id
	if raw_id != null then
		resolved = resolve_system_id(raw_id)
		if resolved.is_error then
			# Only a warning at this point: the input may still provide a
			# stream to read from.
			model.fire_warning(resolved.error.message, resolved.error)
		else
			model.locator.system_id = resolved.value
		end
	end
	model.locator.public_id = input.public_id
	# TODO: encoding

	var stream = input.stream
	if stream != null then
		# An explicit stream takes precedence over the system identifier.
		lexer = new XophonLexer(model, stream)
		parse_main
	else if resolved == null then
		model.fire_fatal_error("At least a stream or a system identifier must be specified. None given.",
				null)
	else if resolved.is_error then
		model.fire_fatal_error("File <{input.system_id.as(not null)}> not found.", null)
	else
		# The file reader is opened here, so it is also closed here once the
		# parsing is over.
		lexer = new XophonLexer(model, new FileReader.open(resolved.value))
		parse_main
		lexer.close
	end
end
159
redef fun parse_file(system_id: String) do
	# Wrap the identifier in an input source and reuse the generic entry point.
	var source = new InputSource.with_system_id(system_id)
	parse(source)
end
163
164
165 ############################################################################
166 # Parsing
167
168 # Note: No `expect_*` function (except `parse_main`) calls `read_char`
169 # for the first byte, and each one leaves the byte just after its production
170 # in `last_char` (except in case of fatal error). They return `false` on
171 # fatal error and at the end of the file.
172
173 # Parse the main entity.
private fun parse_main do
	# Announce the locator and the start of the document before any byte
	# is read.
	model.fire_document_locator
	model.fire_start_document

	lexer.start
	# The result of `expect_document` is not used here: the end of the
	# document is fired whether the parsing succeeded or not.
	expect_document

	model.fire_end_document
end
181
182 # Expect a `document` production.
private fun expect_document: Bool do
	var got_doctype = false
	var got_element = false

	# `true` only while looking at the very first token of the document:
	# this is the only place where an XML declaration is accepted.
	var at_start = true

	# Before the root element, the document may contain an XML declaration
	# (first token only), processing instructions, comments and the
	# `DOCTYPE` declaration. These productions may be separated by white
	# space.
	loop
		if lexer.accept('<') then
			if lexer.accept('?') then
				if at_start then
					if not expect_pi_or_xml_decl then return false
				else
					if not expect_pi then return false
				end
			else if lexer.accept('!') then
				if lexer.accept('-') then
					if not lexer.expect('-',
							" at the beginning of a comment") or
							not expect_comment then
						return false
					end
				else if got_doctype then
					return lexer.fire_unexpected_char(". Expecting `-`")
				else if expect_doctype_decl then
					got_doctype = true
				else
					return false
				end
			else
				if not expect_root then return false
				# The `DOCTYPE` declaration *must* come before the root
				# element.
				got_doctype = true
				got_element = true
			end
		else if not lexer.accept_s then
			return lexer.fire_unexpected_char(
					". Expecting a white space or `<`")
		end
		at_start = false
		if got_element then break
	end
	return expect_miscs
end
252
# TODO: `DOCTYPE` declarations are not parsed yet; a fatal error is fired
# and its result is returned.
private fun expect_doctype_decl: Bool do return model.fire_fatal_error("DTD not supported yet.\n", null)
256
257 # Expect the root `element` production, without the first `<` token.
private fun expect_root: Bool do
	# Character data accumulated between two markup constructs.
	var text = new FlatBuffer
	var ok = expect_stag

	# Consume the content until the root is closed, the input ends or a
	# chunk fails.
	while ok and not lexer.eof and not model.root_closed do
		ok = expect_content_chunk(text)
	end
	ok = ok and model.expect_root_closed

	# Pending character data is reported even when the parsing failed.
	flush(text)
	return ok
end
272
273 # Parse a `EmptyElemTag | STag | ETag | Reference | CDSect | PI | Comment | CharData` production.
274 #
275 # If the last read byte matches the `CharData` production, push the char in
276 # `char_data`. Else, flush `CharData` as a `characters` event.
private fun expect_content_chunk(char_data: Buffer): Bool do
	if lexer.accept('<') then
		# Markup ends the current run of character data.
		flush(char_data)
		if lexer.accept('!') then
			if lexer.accept('-') then
				return lexer.expect('-',
						" at the beginning of a comment") and
						expect_comment
			end
			if lexer.accept('[') then return expect_cd_sect
			return lexer.fire_unexpected_char(
					". Expecting `--` or `[CDATA[`")
		end
		if lexer.accept('?') then return expect_pi
		if lexer.accept('/') then return expect_etag
		return expect_stag
	end
	if lexer.accept('&') then
		# The text before the reference and the replacement text of the
		# reference are flushed separately.
		flush(char_data)
		var resolved = expect_reference(char_data)
		flush(char_data)
		return resolved
	end
	# Plain character data: accumulate it in `char_data`.
	return lexer.expect_xml_char(char_data)
end
307
308 # Expect a `EmptyElemTag | STag` production, without the initial `<`.
private fun expect_stag: Bool do
	var name_buffer = new FlatBuffer

	if not lexer.expect_name(name_buffer) then return false
	var name = name_buffer.to_s

	model.fire_start_attributes
	loop
		# Directly after the name or after an attribute value.
		if lexer.accept('>') then
			model.fire_start_element(name)
			return true
		end
		if lexer.accept('/') then
			# Empty-element tag: start and end are fired back to back.
			if not lexer.expect('>', "") then return false
			model.fire_start_element(name)
			model.fire_end_element(name)
			return true
		end
		if not lexer.expect_s then
			return lexer.fire_unexpected_char(" in tag. " +
					"Expecting an attribute, `/`, `>` or white space")
		end

		# After white space: end of the tag or one more attribute.
		if lexer.accept('/') then
			if not lexer.expect('>', "") then return false
			model.fire_start_element(name)
			model.fire_end_element(name)
			return true
		end
		if lexer.accept('>') then
			model.fire_start_element(name)
			return true
		end
		if not expect_attribute then return false
	end
end
351
352 # Expect a `ETag` production, without the initial `</`.
private fun expect_etag: Bool do
	var name_buf = new FlatBuffer

	# Name, then optional white space, then the closing `>`.
	if not lexer.expect_name(name_buf) then return false
	if not lexer.skip_s then return false
	if not lexer.expect('>', "") then return false
	return model.fire_end_element(name_buf.to_s)
end
364
365 # Expect an `Attributes` production.
private fun expect_attribute: Bool do
	var name = new FlatBuffer
	var value = new FlatBuffer

	# `name = "value"`; each step is attempted only if the previous one
	# succeeded.
	var parsed = lexer.expect_name(name) and
			lexer.expect_eq and
			expect_att_value(value)

	if parsed then model.fire_attribute(name.to_s, value.to_s)
	return parsed
end
379
380 # Expect the `Misc*` production at the end of a document.
private fun expect_miscs: Bool do
	while not lexer.eof do
		if not lexer.accept('<') then
			# Outside markup, only white space is allowed.
			if lexer.accept_s then continue
			return lexer.fire_unexpected_char(
					". Expecting a white space or `<`")
		end
		if lexer.accept('?') then
			if not expect_pi then return false
		else if lexer.accept('!') then
			var opened = lexer.expect_string("--",
					" at the beginning of a comment")
			if not (opened and expect_comment) then return false
		else
			return lexer.fire_unexpected_char(". Expecting `?` or `!`")
		end
	end
	return true
end
402
403 # Expect a `AttValue` production.
404 #
405 # Append the parsed value to `buffer`.
private fun expect_att_value(buffer: Buffer): Bool do
	# Opening quote; a negative value signals a failure.
	var quote = lexer.expect_delimiter
	if quote < 0 then return false

	loop
		# The same quote closes the value.
		if lexer.accept_int(quote) then return true
		if lexer.accept('&') then
			# TODO: [WFC: No < in Attribute Values]
			if not expect_reference(buffer) then return false
		else if not lexer.expect_att_value_char(buffer) then
			return false
		end
	end
end
421
422 # Expect a `SystemLiteral` production.
423 #
424 # Also used to parse productions that do not have references.
425 # Append the parsed value to `buffer`.
private fun expect_literal(buffer: Buffer): Bool do
	# Opening quote; a negative value signals a failure.
	var quote = lexer.expect_delimiter
	if quote < 0 then return false

	loop
		# The same quote closes the literal.
		if lexer.accept_int(quote) then return true
		if not lexer.expect_xml_char(buffer) then return false
	end
end
438
439
440 # Expect a `Comment` production, without the beginning.
441 #
442 # Assume `last_char` is the fifth byte of the production that is, the
443 # next byte after the `'<!--'` token.
private fun expect_comment: Bool do
	var text: Buffer = new FlatBuffer

	loop
		if not lexer.accept('-') then
			# Ordinary comment character.
			if not lexer.expect_xml_char(text) then return false
			continue
		end
		if lexer.accept('-') then
			# A double hyphen must be followed by `>`, which ends the comment.
			if not lexer.expect('>',
					" after a double-hyphen (`--`) in a comment") then
				return false
			end
			model.fire_comment(text.to_s)
			return true
		end
		# Single hyphen: part of the comment text, like the byte after it.
		text.chars.push('-')
		if not lexer.expect_xml_char(text) then return false
	end
end
467
468 # Expect a `PI` production, without the beginning.
469 #
470 # Assume `last_char` is the third byte of the production that is, the
471 # next byte after the `'<?'` token.
private fun expect_pi: Bool do
	var target = new FlatBuffer

	# The data part is only parsed once a valid target has been read.
	if not lexer.expect_pi_target(target) then return false
	return expect_pi_data(target.to_s)
end
478
479 # Expect the data part and the `'?>'` token of a `PI` production.
private fun expect_pi_data(target: String): Bool do
	# Parse what follows the target of a processing instruction and fire the
	# `processing_instruction` event. Return `false` when the production is
	# malformed or truncated.
	if lexer.accept('?') then
		# No data at all: the target is directly followed by `?>`.
		if lexer.expect('>', " at the end of a processing instruction") then
			model.fire_processing_instruction(target, null)
			return true
		else
			return false
		end
	else if lexer.accept_s then
		var data: Buffer = new FlatBuffer

		loop
			if lexer.accept('?') then
				if lexer.accept('>') then break
				# A lone `?` is ordinary data. The byte that follows it is
				# left to the next iteration: it may itself be the `?` of
				# the closing `?>` (data ending with `?`, as in `a??>`).
				data.chars.push('?')
			else if not lexer.expect_xml_char(data) then
				return false
			end
		end
		model.fire_processing_instruction(target, data.to_s)
		return true
	else
		return lexer.fire_unexpected_char(" after a processing " +
				"instruction target. Expecting a white space or `?>`")
	end
end
510
511 # Expect a `PI | XMLDecl` production, without the beginning.
512 #
513 # Assume `last_char` is the third byte of the production that is, the
514 # next byte after the `'<?'` token.
private fun expect_pi_or_xml_decl: Bool do
	var buffer: Buffer = new FlatBuffer

	if not lexer.expect_name(buffer) then return false
	var target = buffer.to_s

	# The exact name `xml` introduces the XML declaration. Any other name
	# must be a valid processing instruction target.
	if target == "xml" then return expect_xml_decl
	return lexer.check_pi_target(target) and expect_pi_data(target)
end
532
533 # Expect a `XMLDecl` production, without the initial `<?xml` token.
private fun expect_xml_decl: Bool do
	if not expect_version_info then return false

	# Without white space after the version, only `?>` may follow.
	if not lexer.accept_s then return lexer.expect_string("?>", "")

	if lexer.is_char('e') then
		if not expect_encoding_decl then return false
		# At this point, we can only accept `S` or `'?>'`.
		if not lexer.accept_s then return lexer.expect_string("?>", "")
	end
	if lexer.is_char('s') and not expect_sd_decl then return false
	return lexer.skip_s and lexer.expect_string("?>", "")
end
550
551 # Expect a `EncodingDecl` token, without the initial `S` token.
private fun expect_encoding_decl: Bool do
	var encoding = new FlatBuffer

	# `encoding = "name"`; each step is attempted only if the previous one
	# succeeded.
	var parsed = lexer.expect_string("encoding", "") and lexer.expect_eq and
			expect_literal(encoding)
	if not parsed then return false

	if encoding.has("^[A-Za-z][A-Za-z0-9._-]*$".to_re) then
		# TODO: Do something with the value.
		return true
	end
	return model.fire_fatal_error("`{encoding.to_s}` is not a valid " +
			"encoding name.", null)
end
566
567 # Expect a `SDDecl` token, without the initial `S` token.
private fun expect_sd_decl: Bool do
	var buf = new FlatBuffer

	# `standalone = "value"`; each step is attempted only if the previous
	# one succeeded.
	var parsed = lexer.expect_string("standalone", "") and lexer.expect_eq and
			expect_literal(buf)
	if not parsed then return false

	var value = buf.to_s
	if value == "yes" or value == "no" then
		# TODO: Do something with the value.
		return true
	end
	return model.fire_fatal_error("`{value}` is not a valid value for " +
			"the `standalone` declaration. Expecting `yes` or `no`.",
			null)
end
585
586 # Expect a `CDSect` production, without the beginning.
587 #
588 # Assume `last_char` is the fourth byte of the production that is, the
589 # next byte after the `'<!['` token.
private fun expect_cd_sect: Bool do
	# Parse the rest of a CDATA section and report its content between the
	# `start_cdata` and `end_cdata` events. Return `false` when the section
	# is malformed or truncated.
	var buffer: Buffer = new FlatBuffer

	# Number of consecutive closing brackets read but not yet written to
	# `buffer`.
	var closing = 0

	if not lexer.expect_string("CDATA[",
			" at the beginning of a CDATA section.") then
		return false
	end
	model.fire_start_cdata
	loop
		if lexer.accept(']') then
			# May be part of the closing `]]>`: hold it back for now.
			closing += 1
		else if closing >= 2 and lexer.accept('>') then
			# End of the section. The terminator must be tested *before*
			# the counter is reset, otherwise it can never match. Only the
			# last two brackets belong to `]]>`; earlier ones are data.
			for i in [2..closing[ do
				buffer.chars.push(']')
			end
			break
		else
			# The held-back brackets were ordinary data after all.
			for i in [0..closing[ do
				buffer.chars.push(']')
			end
			closing = 0
			if not lexer.expect_xml_char(buffer) then return false
		end
	end
	flush(buffer)
	model.fire_end_cdata
	return true
end
618
619 # Expect a `VersionInfo` production.
private fun expect_version_info: Bool do
	# White space, then `version`, then `=`.
	if not lexer.expect_s then return false
	if not lexer.expect_string("version",
			" in the first attribute name of the XML declaration") then
		return false
	end
	if not lexer.expect_eq then return false

	var minor: Buffer = new FlatBuffer
	# Opening quote; a negative value signals a failure.
	var quote = lexer.expect_delimiter
	if quote < 0 then return false

	# `1.`, the minor version digits, then the matching closing quote.
	var parsed = lexer.expect_string("1.", " as XML major version") and
			lexer.expect_digits(minor) and
			lexer.expect_int(quote, "")
	if not parsed then return false

	var minor_version = minor.to_s
	if minor_version != "0" then
		# Other 1.x versions are only worth a warning.
		model.fire_warning("Only XML 1.0 is supported. " +
				"Got a XML 1.{minor_version} document.", null)
	end
	return true
end
643
644 # Expect a `Reference`, without the initial `&`.
645 #
646 # Append the value to the buffer.
private fun expect_reference(buffer: Buffer): Bool do
	# TODO: [WFC: Entity Declared]
	# TODO: [VC: Entity Declared]
	# TODO: [WFC: Parsed Entity]
	# TODO: [WFC: No Recursion]
	# TODO: Unicode

	var ref = new FlatBuffer

	if lexer.accept('#') then
		# Character reference: `&#x<hex>;` or `&#<decimal>;`.
		if lexer.accept('x') then
			if not lexer.expect_hex(ref) then
				return lexer.fire_unexpected_char(
						". Expecting an hexadecimal digit")
			end
			buffer.chars.add(ref.to_hex.ascii)
			return lexer.expect(';', "")
		end
		if not lexer.accept_digits(ref) then
			return lexer.fire_unexpected_char(" in a character reference. " +
					"Expecting `x` or a decimal digit")
		end
		buffer.chars.add(ref.to_i.ascii)
		return lexer.expect(';', "")
	end

	# Entity reference: `&<name>;`.
	if not lexer.expect_name(ref) then
		return lexer.fire_unexpected_char(
				" in a reference. Expecting `#` or a name")
	end
	var name = ref.to_s
	# A colon in the name is reported as an error, but the parsing goes on.
	if name.has(":") then
		model.fire_error("The entity name `{name}` contains a colon.", null)
	end

	var value = resolve_reference(name)
	if value == null then
		model.fire_fatal_error("Unknown entity `{name}`.", null)
		return false
	end
	buffer.append(value)
	return lexer.expect(';', "")
end
691
692 # Resolve the entity reference or return `null`.
private fun resolve_reference(name: String): nullable String do
	# Only the five predefined entities are known.
	if name == "lt" then return "<"
	if name == "gt" then return ">"
	if name == "amp" then return "&"
	if name == "quot" then return "\""
	if name == "apos" then return "'"
	# TODO: Support non-builtin entities
	return null
end
709
710 # Flush the specified buffer as a `characters` event.
711 #
712 # Do nothing if `buffer` is empty.
private fun flush(buffer: Buffer) do
	# Nothing to report: no event is fired for an empty buffer.
	if buffer.length <= 0 then return

	model.fire_characters(buffer.to_s)
	buffer.clear
end
719
720
721 ############################################################################
722 # Paths
723
724 # Resolve the specified system id.
private fun resolve_system_id(system_id: String): MaybeError[String, Error] do
	# TODO: handle URIs
	# For now, the identifier is always treated as a local path.
	var local_path = realpath(system_id)
	return local_path
end
729
730 # Resolve the specified POSIX path.
731 #
732 # Like `String.realpath`, but with error handling.
private fun realpath(path: String): MaybeError[String, Error] do
	var cs = path.to_cstring.file_realpath

	# A non-null address carries the resolved path.
	if not cs.address_is_null then
		return new MaybeError[String, Error](cs.to_s, null)
	end
	# A null address is reported as a missing file.
	return new MaybeError[String, Error](null,
			new Error("File <{path}> not found."))
end
743 end