stream: move decoder to stream and rename as codec
[nit.git] / lib / core / stream.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # Input and output streams of characters
12 module stream
13
14 intrude import text::ropes
15 import error
16 intrude import bytes
17 import codecs
18
19 in "C" `{
20 #include <unistd.h>
21 #include <string.h>
22 #include <signal.h>
23 `}
24
# Error reported by an operation on a `Stream`
#
# Streams record the most recent failure in `Stream::last_error`.
class IOError
	super Error
end
29
# Any kind of stream to read/write/both to or from a source
abstract class Stream
	# Codec used to transform raw data to text
	#
	# Note: defaults to UTF-8
	var codec: Codec = utf8_codec is protected writable(set_codec)

	# Lookahead buffer for codecs
	#
	# Since some codecs are multibyte, a lookahead may be required
	# to store the next bytes and consume them only if a valid character
	# is read.
	protected var lookahead: CString is noinit

	# Capacity of the lookahead
	protected var lookahead_capacity = 0

	# Current occupation of the lookahead
	protected var lookahead_length = 0

	# Buffer for writing data to a stream
	protected var write_buffer: CString is noinit

	# Allocate `lookahead` and `write_buffer` according to the initial `codec`.
	init do
		var lcap = codec.max_lookahead
		lookahead = new CString(lcap)
		write_buffer = new CString(lcap)
		lookahead_length = 0
		lookahead_capacity = lcap
	end

	# Change the codec for this stream.
	#
	# If `c` needs a larger lookahead than the one currently allocated,
	# `lookahead` and `write_buffer` are reallocated; the bytes already
	# waiting in the lookahead are carried over to the new buffer.
	fun codec=(c: Codec) do
		# The size must come from the *new* codec `c`: `codec` is still the
		# previous one at this point and would never make the buffers grow.
		var lcap = c.max_lookahead
		if lcap > lookahead_capacity then
			var lk = new CString(lcap)
			var llen = lookahead_length
			if llen > 0 then
				lookahead.copy_to(lk, llen, 0, 0)
			end
			lookahead = lk
			lookahead_capacity = lcap
			write_buffer = new CString(lcap)
		end
		set_codec(c)
	end

	# Error produced by the file stream
	#
	# ~~~
	# var ifs = new FileReader.open("donotmakethisfile.binx")
	# ifs.read_all
	# ifs.close
	# assert ifs.last_error != null
	# ~~~
	var last_error: nullable IOError = null

	# close the stream
	fun close is abstract

	# Pre-work hook.
	#
	# Used to inform `self` that operations will start.
	# Specific streams can use this to prepare some resources.
	#
	# Is automatically invoked at the beginning of `with` structures.
	#
	# Do nothing by default.
	fun start do end

	# Post-work hook.
	#
	# Used to inform `self` that the operations are over.
	# Specific streams can use this to free some resources.
	#
	# Is automatically invoked at the end of `with` structures.
	#
	# call `close` by default.
	fun finish do close
end
108
# A `Stream` that can be read from
abstract class Reader
	super Stream

	# Reads a character. Returns `null` on EOF or timeout
	fun read_char: nullable Char is abstract

	# Reads a byte. Returns `null` on EOF or timeout
	fun read_byte: nullable Byte is abstract

	# Reads a String of at most `i` length
	fun read(i: Int): String do return read_bytes(i).to_s

	# Read at most i bytes
	#
	# Returns an empty `Bytes` if `last_error` is already set.
	fun read_bytes(i: Int): Bytes
	do
		if last_error != null then return new Bytes.empty
		# Reserve room for the `i` requested bytes up front
		var buf = new Bytes.with_capacity(i)
		while i > 0 and not eof do
			var c = read_byte
			# `null` without `eof` means nothing was available yet: retry
			if c != null then
				buf.add c
				i -= 1
			end
		end
		return buf
	end

	# Read a string until the end of the line.
	#
	# The line terminator '\n' and '\r\n', if any, is removed in each line.
	#
	# ~~~
	# var txt = "Hello\n\nWorld\n"
	# var i = new StringReader(txt)
	# assert i.read_line == "Hello"
	# assert i.read_line == ""
	# assert i.read_line == "World"
	# assert i.eof
	# ~~~
	#
	# Only LINE FEED (`\n`), CARRIAGE RETURN & LINE FEED (`\r\n`), and
	# the end of file (EOF) is considered to delimit the end of lines.
	# CARRIAGE RETURN (`\r`) alone is not used for the end of line.
	#
	# ~~~
	# var txt2 = "Hello\r\n\n\rWorld"
	# var i2 = new StringReader(txt2)
	# assert i2.read_line == "Hello"
	# assert i2.read_line == ""
	# assert i2.read_line == "\rWorld"
	# assert i2.eof
	# ~~~
	#
	# NOTE: Use `append_line_to` if the line terminator needs to be preserved.
	fun read_line: String
	do
		if last_error != null then return ""
		if eof then return ""
		var s = new FlatBuffer
		append_line_to(s)
		return s.to_s.chomp
	end

	# Read all the lines until the eof.
	#
	# The line terminator '\n' and `\r\n` is removed in each line,
	#
	# ~~~
	# var txt = "Hello\n\nWorld\n"
	# var i = new StringReader(txt)
	# assert i.read_lines == ["Hello", "", "World"]
	# ~~~
	#
	# This method is more efficient than splitting
	# the result of `read_all`.
	#
	# NOTE: SEE `read_line` for details.
	fun read_lines: Array[String]
	do
		var res = new Array[String]
		while not eof do
			res.add read_line
		end
		return res
	end

	# Return an iterator that read each line.
	#
	# The line terminator '\n' and `\r\n` is removed in each line,
	# The line are read with `read_line`. See this method for details.
	#
	# ~~~
	# var txt = "Hello\n\nWorld\n"
	# var i = new StringReader(txt)
	# assert i.each_line.to_a == ["Hello", "", "World"]
	# ~~~
	#
	# Unlike `read_lines` that read all lines at the call, `each_line` is lazy.
	# Therefore, the stream should not be closed until the end of the stream.
	#
	# ~~~
	# i = new StringReader(txt)
	# var el = i.each_line
	#
	# assert el.item == "Hello"
	# el.next
	# assert el.item == ""
	# el.next
	#
	# i.close
	#
	# assert not el.is_ok
	# # closed before "world" is read
	# ~~~
	fun each_line: LineIterator do return new LineIterator(self)

	# Read all the stream until the eof.
	#
	# The content of the file is returned as a String.
	#
	# ~~~
	# var txt = "Hello\n\nWorld\n"
	# var i = new StringReader(txt)
	# assert i.read_all == txt
	# ~~~
	fun read_all: String do
		var s = read_all_bytes
		var slen = s.length
		if slen == 0 then return ""
		var rets = ""
		var pos = 0
		var str = s.items.clean_utf8(slen)
		slen = str.byte_length
		var sits = str.items
		var remsp = slen
		# The 129 size was decided more or less arbitrarily
		# It will require some more benchmarking to compute
		# if this is the best size or not
		var chunksz = 129
		while pos < slen do
			if chunksz > remsp then
				# Less than a full chunk remains: take it all and stop
				rets += new FlatString.with_infos(sits, remsp, pos)
				break
			end
			# Cut the chunk on a character boundary
			var st = sits.find_beginning_of_char_at(pos + chunksz - 1)
			var byte_length = st - pos
			rets += new FlatString.with_infos(sits, byte_length, pos)
			pos = st
			remsp -= byte_length
		end
		if rets isa Concat then return rets.balance
		return rets
	end

	# Read all the stream until the eof.
	#
	# The content of the file is returned verbatim.
	fun read_all_bytes: Bytes
	do
		if last_error != null then return new Bytes.empty
		var s = new Bytes.empty
		while not eof do
			var c = read_byte
			if c != null then s.add(c)
		end
		return s
	end

	# Read a string until the end of the line and append it to `s`.
	#
	# Unlike `read_line` and other related methods,
	# the line terminator '\n', if any, is preserved in each line.
	# Use the method `Text::chomp` to safely remove it.
	#
	# ~~~
	# var txt = "Hello\n\nWorld\n"
	# var i = new StringReader(txt)
	# var b = new FlatBuffer
	# i.append_line_to(b)
	# assert b == "Hello\n"
	# i.append_line_to(b)
	# assert b == "Hello\n\n"
	# i.append_line_to(b)
	# assert b == txt
	# assert i.eof
	# ~~~
	#
	# If `\n` is not present at the end of the result, it means that
	# a non-eol terminated last line was returned.
	#
	# ~~~
	# var i2 = new StringReader("hello")
	# assert not i2.eof
	# var b2 = new FlatBuffer
	# i2.append_line_to(b2)
	# assert b2 == "hello"
	# assert i2.eof
	# ~~~
	#
	# NOTE: The single character LINE FEED (`\n`) delimits the end of lines.
	# Therefore CARRIAGE RETURN & LINE FEED (`\r\n`) is also recognized.
	fun append_line_to(s: Buffer)
	do
		if last_error != null then return
		loop
			var x = read_char
			if x == null then
				if eof then return
			else
				s.chars.push(x)
				if x == '\n' then return
			end
		end
	end

	# Has the end of the stream been reached?
	#
	# Returns `false` while there is still something to read.
	fun eof: Bool is abstract

	# Read the next sequence of non whitespace characters.
	#
	# Leading whitespace characters are skipped.
	# The first whitespace character that follows the result is consumed.
	#
	# An empty string is returned if the end of the file or an error is encountered.
	#
	# ~~~
	# var w = new StringReader(" Hello, \n\t World!")
	# assert w.read_word == "Hello,"
	# assert w.read_char == '\n'
	# assert w.read_word == "World!"
	# assert w.read_word == ""
	# ~~~
	#
	# `Char::is_whitespace` determines what is a whitespace.
	fun read_word: String
	do
		var buf = new FlatBuffer
		var c = read_nonwhitespace
		if c != null then
			buf.add(c)
			while not eof do
				c = read_char
				if c == null then break
				if c.is_whitespace then break
				buf.add(c)
			end
		end
		var res = buf.to_s
		return res
	end

	# Skip whitespace characters (if any) then return the following non-whitespace character.
	#
	# Returns the code point of the character.
	# Returns `null` on end of file or error.
	#
	# In fact, this method works like `read_char` except it skips whitespace.
	#
	# ~~~
	# var w = new StringReader(" \nab\tc")
	# assert w.read_nonwhitespace == 'a'
	# assert w.read_nonwhitespace == 'b'
	# assert w.read_nonwhitespace == 'c'
	# assert w.read_nonwhitespace == null
	# ~~~
	#
	# Trailing whitespace is skipped too, and never returned:
	#
	# ~~~
	# var w2 = new StringReader("a \n")
	# assert w2.read_nonwhitespace == 'a'
	# assert w2.read_nonwhitespace == null
	# ~~~
	#
	# `Char::is_whitespace` determines what is a whitespace.
	fun read_nonwhitespace: nullable Char
	do
		while not eof do
			var c = read_char
			if c == null then return null
			if not c.is_whitespace then return c
		end
		# Only whitespace (or nothing) was left before the end of the stream
		return null
	end
end
389
# Lazy iterator over the lines of a `Reader`.
#
# Instances are obtained from `Reader::each_line`;
# see that method for details and examples.
class LineIterator
	super Iterator[String]

	# The original stream
	var stream: Reader

	# There is a line to get as long as the stream is not at the EOF.
	#
	# Once the EOF is seen, the stream is closed if `close_on_finish` is set.
	redef fun is_ok
	do
		if not stream.eof then return true
		if close_on_finish then stream.close
		return false
	end

	# The current line; it is read from the stream once, then cached.
	redef fun item
	do
		var cached = line
		if cached != null then return cached
		var fresh = stream.read_line
		line = fresh
		return fresh
	end

	# The last line read (cache)
	private var line: nullable String = null

	# Skip to the following line.
	redef fun next
	do
		# The current line must be consumed from the stream even if
		# nobody asked for it.
		if line == null then item
		# Forget it, so that the next `item` reads a new line
		line = null
	end

	# Close the stream when the stream is at the EOF.
	#
	# Default is false.
	var close_on_finish = false is writable

	# Close the stream if `close_on_finish` is set.
	redef fun finish
	do
		if close_on_finish then stream.close
	end
end
436
# A `Reader` that can tell, without blocking, whether data is ready to be read
abstract class PollableReader
	super Reader

	# Is there something to read? (without blocking)
	fun poll_in: Bool is abstract
end
445
# A `Stream` that can be written to
abstract class Writer
	super Stream

	# Writes bytes from `s`
	fun write_bytes(s: Bytes) is abstract

	# Writes the text `s`
	fun write(s: Text) is abstract

	# Writes the single byte `value`
	fun write_byte(value: Byte) is abstract

	# Writes the single character `c`
	#
	# By default, `c` is converted to a string and handed to `write`.
	fun write_char(c: Char)
	do
		write(c.to_s)
	end

	# Can the stream be used to write
	fun is_writable: Bool is abstract
end
465
# Things that can be efficiently written to a `Writer`
#
# The point of this interface is to allow the instance to be efficiently
# written into a `Writer`.
#
# Ready-to-save documents usually provide this interface.
interface Writable
	# Write itself to a `stream`
	# The specific logic is left to the concrete subclasses
	fun write_to(stream: Writer) is abstract

	# Like `write_to` but return a new String (may be quite large)
	#
	# This functionality is anecdotal, since the point
	# of a writable object is to be efficiently written to a
	# stream without having to allocate and concatenate strings
	fun write_to_string: String
	do
		var sink = new StringWriter
		write_to(sink)
		return sink.to_s
	end
end
489
redef class Bytes
	super Writable

	# Write the raw content of `self` to `s`
	redef fun write_to(s)
	do
		s.write_bytes(self)
	end

	# Direct conversion with `to_s`, no intermediate stream is used
	redef fun write_to_string
	do
		return to_s
	end
end
496
redef class Text
	super Writable

	# Write the whole text to `stream`
	redef fun write_to(stream)
	do
		stream.write(self)
	end
end
501
# Input streams with a buffered input for efficiency purposes
#
# Subclasses provide `fill_buffer` and `end_reached`, and allocate the
# buffer with `prepare_buffer`.
abstract class BufferedReader
	super Reader

	# Read the next byte of the buffer as a character.
	#
	# Returns `null` if `last_error` is set or at the EOF;
	# reading at the EOF also sets `last_error`.
	redef fun read_char
	do
		if last_error != null then return null
		if eof then
			last_error = new IOError("Stream has reached eof")
			return null
		end
		# TODO: Fix when supporting UTF-8
		var c = _buffer[_buffer_pos].to_i.code_point
		_buffer_pos += 1
		return c
	end

	# Read the next byte of the buffer.
	#
	# Returns `null` if `last_error` is set or at the EOF;
	# reading at the EOF also sets `last_error`.
	redef fun read_byte
	do
		if last_error != null then return null
		if eof then
			last_error = new IOError("Stream has reached eof")
			return null
		end
		var c = _buffer[_buffer_pos]
		_buffer_pos += 1
		return c
	end

	# Resets the internal buffer
	fun buffer_reset do
		_buffer_length = 0
		_buffer_pos = 0
	end

	# Peeks up to `n` bytes in the buffer
	#
	# The operation does not consume the buffer
	#
	# ~~~nitish
	# var x = new FileReader.open("File.txt")
	# assert x.peek(5) == x.read(5)
	# ~~~
	fun peek(i: Int): Bytes do
		if eof then return new Bytes.empty
		var remsp = _buffer_length - _buffer_pos
		var bf = new Bytes.with_capacity(i)
		if i <= remsp then
			# Everything requested is already buffered
			bf.append_ns_from(_buffer, i, _buffer_pos)
			return bf
		end
		# Take what is buffered, then read the missing bytes from the source
		bf.append_ns_from(_buffer, remsp, _buffer_pos)
		_buffer_pos = _buffer_length
		read_intern(i - bf.length, bf)
		# Rebuild the buffer as: peeked bytes + what is left unread,
		# so that the peeked bytes can be read again
		remsp = _buffer_length - _buffer_pos
		var full_len = bf.length + remsp
		if full_len > _buffer_capacity then
			var c = _buffer_capacity
			while c < full_len do c = c * 2 + 2
			_buffer_capacity = c
		end
		var nns = new CString(_buffer_capacity)
		bf.items.copy_to(nns, bf.length, 0, 0)
		_buffer.copy_to(nns, remsp, _buffer_pos, bf.length)
		_buffer = nns
		_buffer_pos = 0
		_buffer_length = full_len
		return bf
	end

	redef fun read_bytes(i)
	do
		if last_error != null then return new Bytes.empty
		var buf = new Bytes.with_capacity(i)
		read_intern(i, buf)
		return buf
	end

	# Fills `buf` with at most `i` bytes read from `self`
	#
	# Returns the number of bytes appended to `buf`.
	private fun read_intern(i: Int, buf: Bytes): Int do
		# `eof` refills the buffer when it is exhausted
		if eof then return 0
		var p = _buffer_pos
		var bufsp = _buffer_length - p
		if bufsp >= i then
			_buffer_pos += i
			buf.append_ns_from(_buffer, i, p)
			return i
		end
		# Not enough buffered bytes: take them all, then continue
		# with the next content of the buffer
		_buffer_pos = _buffer_length
		var readln = _buffer_length - p
		buf.append_ns_from(_buffer, readln, p)
		var rd = read_intern(i - readln, buf)
		return rd + readln
	end

	redef fun read_all_bytes
	do
		if last_error != null then return new Bytes.empty
		var s = new Bytes.with_capacity(10)
		while not eof do
			var j = _buffer_pos
			var k = _buffer_length
			var rd_sz = k - j
			# Always go through `_buffer`: the attribute can be reassigned
			# (see `prepare_buffer` and `peek`), so an alias taken before
			# the loop could be stale after a `fill_buffer`.
			s.append_ns_from(_buffer, rd_sz, j)
			_buffer_pos = k
			fill_buffer
		end
		return s
	end

	redef fun append_line_to(s)
	do
		var lb = new Bytes.with_capacity(10)
		loop
			# First phase: look for a '\n'
			var i = _buffer_pos
			while i < _buffer_length and _buffer[i] != 0xAu8 do
				i += 1
			end

			# The scan stopped before the end of the buffer: it is on a '\n',
			# which is kept in the line
			var eol = i < _buffer_length
			if eol then i += 1

			# if there is something to append
			if i > _buffer_pos then
				# Copy from the buffer to the line, in a single operation
				lb.append_ns_from(_buffer, i - _buffer_pos, _buffer_pos)
				_buffer_pos = i
			else
				assert end_reached
				s.append lb.to_s
				return
			end

			# The line is complete if '\n' is found or nothing more can be read
			if eol or end_reached then
				s.append lb.to_s
				return
			end
			fill_buffer
		end
	end

	redef fun eof
	do
		if _buffer_pos < _buffer_length then return false
		if end_reached then return true
		# The buffer is exhausted: try to get more before answering
		fill_buffer
		return _buffer_pos >= _buffer_length and end_reached
	end

	# The buffer
	private var buffer: CString = new CString(0)

	# The current position in the buffer
	private var buffer_pos = 0

	# Length of the current buffer (i.e. number of bytes in the buffer)
	private var buffer_length = 0

	# Capacity of the buffer
	private var buffer_capacity = 0

	# Fill the buffer
	protected fun fill_buffer is abstract

	# Has the last fill_buffer reached the end
	protected fun end_reached: Bool is abstract

	# Allocate a `_buffer` for a given `capacity`.
	protected fun prepare_buffer(capacity: Int)
	do
		_buffer = new CString(capacity)
		_buffer_pos = 0 # need to read
		_buffer_length = 0
		_buffer_capacity = capacity
	end
end
697
# A bidirectional `Stream`: both a `Reader` and a `Writer`
abstract class Duplex
	super Reader
	super Writer
end
703
# Write to `bytes` in memory
#
# ~~~
# var writer = new BytesWriter
#
# writer.write "Strings "
# writer.write_char '&'
# writer.write_byte 0x20u8
# writer.write_bytes "bytes".to_bytes
#
# assert writer.to_s == "\\x53\\x74\\x72\\x69\\x6E\\x67\\x73\\x20\\x26\\x20\\x62\\x79\\x74\\x65\\x73"
# assert writer.bytes.to_s == "Strings & bytes"
# ~~~
#
# As with any binary data, UTF-8 code points encoded on two bytes or more
# can be constructed byte by byte.
#
# ~~~
# writer = new BytesWriter
#
# # Write just the character first half
# writer.write_byte 0xC2u8
# assert writer.to_s == "\\xC2"
# assert writer.bytes.to_s == "�"
#
# # Complete the character
# writer.write_byte 0xA2u8
# assert writer.to_s == "\\xC2\\xA2"
# assert writer.bytes.to_s == "¢"
# ~~~
#
# Once closed, the writer silently ignores every write.
class BytesWriter
	super Writer

	# Written memory
	var bytes = new Bytes.empty

	# The written bytes, as given by `Bytes::chexdigest`
	redef fun to_s do return bytes.chexdigest

	redef fun write(str)
	do
		if not closed then str.append_to_bytes bytes
	end

	redef fun write_char(c)
	do
		if not closed then bytes.add_char c
	end

	redef fun write_byte(value)
	do
		if not closed then bytes.add value
	end

	redef fun write_bytes(b)
	do
		if not closed then bytes.append b
	end

	# Is the stream closed?
	protected var closed = false

	# Mark the stream as closed; `bytes` is left untouched.
	redef fun close
	do
		closed = true
	end

	# A writer is writable until it is closed.
	redef fun is_writable
	do
		return not closed
	end
end
772
# `Stream` writing to a `String`
#
# This class has the same behavior as `BytesWriter`
# except for `to_s` which decodes `bytes` to a string.
#
# ~~~
# var writer = new StringWriter
#
# writer.write "Strings "
# writer.write_char '&'
# writer.write_byte 0x20u8
# writer.write_bytes "bytes".to_bytes
#
# assert writer.to_s == "Strings & bytes"
# ~~~
class StringWriter
	super BytesWriter

	# The written content, obtained with `Bytes::to_s`
	redef fun to_s
	do
		return bytes.to_s
	end
end
793
# Read from `bytes` in memory
#
# ~~~
# var reader = new BytesReader(b"a…b")
# assert reader.read_char == 'a'
# assert reader.read_byte == 0xE2u8 # 1st byte of '…'
# assert reader.read_byte == 0x80u8 # 2nd byte of '…'
# assert reader.read_char == '�' # Reads the last byte as an invalid char
# assert reader.read_all_bytes == b"b"
# ~~~
class BytesReader
	super Reader

	# Source data to read
	var bytes: Bytes

	# The current position in `bytes`
	private var cursor = 0

	# Decode the character starting at `cursor`, then move past its bytes.
	redef fun read_char
	do
		var pos = cursor
		if pos >= bytes.length then return null

		var raw = bytes.items
		var res = raw.char_at(pos)
		cursor = pos + raw.length_of_char_at(pos)
		return res
	end

	# Return the byte at `cursor`, then move one byte forward.
	redef fun read_byte
	do
		var pos = cursor
		if pos >= bytes.length then return null

		var res = bytes[pos]
		cursor = pos + 1
		return res
	end

	# Drop the source data; the reader is then at the EOF.
	redef fun close
	do
		bytes = new Bytes.empty
	end

	# Return everything from `cursor` to the end, and move to the end.
	redef fun read_all_bytes
	do
		var rest = bytes.slice_from(cursor)
		cursor = bytes.length
		return rest
	end

	# The end is reached once `cursor` is past the last byte.
	redef fun eof
	do
		return cursor >= bytes.length
	end
end
843
# `Stream` reading from a `String` source
#
# This class has the same behavior as `BytesReader`
# except for its constructor accepting a `String`.
#
# ~~~
# var reader = new StringReader("a…b")
# assert reader.read_char == 'a'
# assert reader.read_byte == 0xE2u8 # 1st byte of '…'
# assert reader.read_byte == 0x80u8 # 2nd byte of '…'
# assert reader.read_char == '�' # Reads the last byte as an invalid char
# assert reader.read_all == "b"
# ~~~
class StringReader
	super BytesReader

	autoinit source

	# Source data to read
	var source: String

	# The bytes to read are those of `source`.
	init
	do
		bytes = source.to_bytes
	end

	# Forget `source`, then close as a `BytesReader`.
	redef fun close
	do
		source = ""
		super
	end
end
871 end
872 end