stream: move decoder to stream and rename as codec
[nit.git] / lib / core / stream.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # Input and output streams of characters
12 module stream
13
14 intrude import text::ropes
15 import error
16 intrude import bytes
17 import codecs
18
19 in "C" `{
20 #include <unistd.h>
21 #include <string.h>
22 #include <signal.h>
23 `}
24
# Any kind of error that could be produced by an operation on Streams
#
# Instances are stored in `Stream::last_error`.
class IOError
	super Error
end
29
# Any kind of stream to read/write/both to or from a source
abstract class Stream
	# Codec used to transform raw data to text
	#
	# Note: defaults to UTF-8
	var codec: Codec = utf8_codec is protected writable(set_codec)

	# Lookahead buffer for codecs
	#
	# Since some codecs are multibyte, a lookahead may be required
	# to store the next bytes and consume them only if a valid character
	# is read.
	protected var lookahead: CString is noinit

	# Capacity of the lookahead
	protected var lookahead_capacity = 0

	# Current occupation of the lookahead
	protected var lookahead_length = 0

	# Buffer for writing data to a stream
	protected var write_buffer: CString is noinit

	# Allocate `lookahead` and `write_buffer` according to the initial `codec`.
	init do
		var lcap = codec.max_lookahead
		lookahead = new CString(lcap)
		write_buffer = new CString(lcap)
		lookahead_length = 0
		lookahead_capacity = lcap
	end

	# Change the codec for this stream.
	#
	# If `c` needs a larger lookahead than currently available, the
	# lookahead is reallocated (its pending bytes are preserved) and a
	# new `write_buffer` of the same capacity is allocated.
	fun codec=(c: Codec) do
		if c.max_lookahead > lookahead_capacity then
			# Size the buffers for the *new* codec, not the one being replaced
			var lcap = c.max_lookahead
			var lk = new CString(lcap)
			var llen = lookahead_length
			if llen > 0 then
				# Keep the bytes that were read ahead but not yet consumed
				lookahead.copy_to(lk, llen, 0, 0)
			end
			lookahead = lk
			lookahead_capacity = lcap
			write_buffer = new CString(lcap)
		end
		set_codec(c)
	end

	# Error produced by the file stream
	#
	#     var ifs = new FileReader.open("donotmakethisfile.binx")
	#     ifs.read_all
	#     ifs.close
	#     assert ifs.last_error != null
	var last_error: nullable IOError = null

	# close the stream
	fun close is abstract

	# Pre-work hook.
	#
	# Used to inform `self` that operations will start.
	# Specific streams can use this to prepare some resources.
	#
	# Is automatically invoked at the beginning of `with` structures.
	#
	# Do nothing by default.
	fun start do end

	# Post-work hook.
	#
	# Used to inform `self` that the operations are over.
	# Specific streams can use this to free some resources.
	#
	# Is automatically invoked at the end of `with` structures.
	#
	# call `close` by default.
	fun finish do close
end
108
# A `Stream` that can be read from
abstract class Reader
	super Stream

	# Reads a character. Returns `null` on EOF or timeout
	fun read_char: nullable Char is abstract

	# Reads a byte. Returns a negative value on error
	fun read_byte: Int is abstract

	# Reads a String of at most `i` length
	fun read(i: Int): String do return read_bytes(i).to_s

	# Read at most i bytes
	#
	# If i <= 0, an empty buffer will be returned
	#
	# Reading stops early at the end of the stream, or as soon as a
	# failed read has recorded a `last_error`.
	fun read_bytes(i: Int): Bytes
	do
		if last_error != null or i <= 0 then return new Bytes.empty
		var s = new CString(i)
		var buf = new Bytes(s, 0, i)
		while i > 0 and not eof do
			var c = read_byte
			if c < 0 then
				# A failure that recorded an error will not go away:
				# stop instead of retrying the same read forever.
				if last_error != null then break
				continue
			end
			buf.add c.to_b
			i -= 1
		end
		return buf
	end

	# Read a string until the end of the line.
	#
	# The line terminator '\n' and '\r\n', if any, is removed in each line.
	#
	# ~~~
	# var txt = "Hello\n\nWorld\n"
	# var i = new StringReader(txt)
	# assert i.read_line == "Hello"
	# assert i.read_line == ""
	# assert i.read_line == "World"
	# assert i.eof
	# ~~~
	#
	# Only LINE FEED (`\n`), CARRIAGE RETURN & LINE FEED (`\r\n`), and
	# the end of file (EOF) are considered to delimit the end of lines.
	# CARRIAGE RETURN (`\r`) alone is not used for the end of line.
	#
	# ~~~
	# var txt2 = "Hello\r\n\n\rWorld"
	# var i2 = new StringReader(txt2)
	# assert i2.read_line == "Hello"
	# assert i2.read_line == ""
	# assert i2.read_line == "\rWorld"
	# assert i2.eof
	# ~~~
	#
	# NOTE: Use `append_line_to` if the line terminator needs to be preserved.
	fun read_line: String
	do
		if last_error != null then return ""
		if eof then return ""
		var s = new FlatBuffer
		append_line_to(s)
		return s.to_s.chomp
	end

	# Read all the lines until the eof.
	#
	# The line terminator '\n' and `\r\n` is removed in each line,
	#
	# ~~~
	# var txt = "Hello\n\nWorld\n"
	# var i = new StringReader(txt)
	# assert i.read_lines == ["Hello", "", "World"]
	# ~~~
	#
	# This method is more efficient than splitting
	# the result of `read_all`.
	#
	# NOTE: SEE `read_line` for details.
	fun read_lines: Array[String]
	do
		var res = new Array[String]
		while not eof do
			res.add read_line
		end
		return res
	end

	# Return an iterator that read each line.
	#
	# The line terminator '\n' and `\r\n` is removed in each line,
	# The line are read with `read_line`. See this method for details.
	#
	# ~~~
	# var txt = "Hello\n\nWorld\n"
	# var i = new StringReader(txt)
	# assert i.each_line.to_a == ["Hello", "", "World"]
	# ~~~
	#
	# Unlike `read_lines` that read all lines at the call, `each_line` is lazy.
	# Therefore, the stream should not be closed until the end of the stream.
	#
	# ~~~
	# i = new StringReader(txt)
	# var el = i.each_line
	#
	# assert el.item == "Hello"
	# el.next
	# assert el.item == ""
	# el.next
	#
	# i.close
	#
	# assert not el.is_ok
	# # closed before "world" is read
	# ~~~
	fun each_line: LineIterator do return new LineIterator(self)

	# Read all the stream until the eof.
	#
	# The content of the file is returned as a String.
	#
	# ~~~
	# var txt = "Hello\n\nWorld\n"
	# var i = new StringReader(txt)
	# assert i.read_all == txt
	# ~~~
	fun read_all: String do
		var s = read_all_bytes
		var slen = s.length
		if slen == 0 then return ""
		var rets = ""
		var pos = 0
		var str = s.items.clean_utf8(slen)
		slen = str.byte_length
		var sits = str.items
		var remsp = slen
		# The 129 size was decided more or less arbitrarily
		# It will require some more benchmarking to compute
		# if this is the best size or not
		var chunksz = 129
		while pos < slen do
			if chunksz > remsp then
				# Last chunk: take everything that remains
				rets += new FlatString.with_infos(sits, remsp, pos)
				break
			end
			# Cut on a character boundary so no chunk splits a multibyte char
			var st = sits.find_beginning_of_char_at(pos + chunksz - 1)
			var byte_length = st - pos
			rets += new FlatString.with_infos(sits, byte_length, pos)
			pos = st
			remsp -= byte_length
		end
		if rets isa Concat then return rets.balance
		return rets
	end

	# Read all the stream until the eof.
	#
	# The content of the file is returned verbatim.
	#
	# Reading also stops as soon as a failed read has recorded a `last_error`.
	fun read_all_bytes: Bytes
	do
		if last_error != null then return new Bytes.empty
		var s = new Bytes.empty
		while not eof do
			var c = read_byte
			if c < 0 then
				# Do not spin on a read that keeps failing
				if last_error != null then break
				continue
			end
			s.add(c.to_b)
		end
		return s
	end

	# Read a string until the end of the line and append it to `s`.
	#
	# Unlike `read_line` and other related methods,
	# the line terminator '\n', if any, is preserved in each line.
	# Use the method `Text::chomp` to safely remove it.
	#
	# ~~~
	# var txt = "Hello\n\nWorld\n"
	# var i = new StringReader(txt)
	# var b = new FlatBuffer
	# i.append_line_to(b)
	# assert b == "Hello\n"
	# i.append_line_to(b)
	# assert b == "Hello\n\n"
	# i.append_line_to(b)
	# assert b == txt
	# assert i.eof
	# ~~~
	#
	# If `\n` is not present at the end of the result, it means that
	# a non-eol terminated last line was returned.
	#
	# ~~~
	# var i2 = new StringReader("hello")
	# assert not i2.eof
	# var b2 = new FlatBuffer
	# i2.append_line_to(b2)
	# assert b2 == "hello"
	# assert i2.eof
	# ~~~
	#
	# NOTE: The single character LINE FEED (`\n`) delimits the end of lines.
	# Therefore CARRIAGE RETURN & LINE FEED (`\r\n`) is also recognized.
	fun append_line_to(s: Buffer)
	do
		if last_error != null then return
		loop
			var x = read_char
			if x == null then
				# Nothing was read: give up at the end of the stream,
				# or when the failure recorded an error (no point retrying).
				if eof or last_error != null then return
			else
				s.chars.push(x)
				if x == '\n' then return
			end
		end
	end

	# Has the end of the stream been reached?
	#
	# Returns `false` if there is something left to read.
	fun eof: Bool is abstract

	# Read the next sequence of non whitespace characters.
	#
	# Leading whitespace characters are skipped.
	# The first whitespace character that follows the result is consumed.
	#
	# An empty string is returned if the end of the file or an error is encountered.
	#
	# ~~~
	# var w = new StringReader(" Hello, \n\t World!")
	# assert w.read_word == "Hello,"
	# assert w.read_char == '\n'
	# assert w.read_word == "World!"
	# assert w.read_word == ""
	# ~~~
	#
	# `Char::is_whitespace` determines what is a whitespace.
	fun read_word: String
	do
		var buf = new FlatBuffer
		var c = read_nonwhitespace
		if c != null then
			buf.add(c)
			while not eof do
				c = read_char
				if c == null then break
				if c.is_whitespace then break
				buf.add(c)
			end
		end
		return buf.to_s
	end

	# Skip whitespace characters (if any) then return the following non-whitespace character.
	#
	# Returns the character read.
	# Returns `null` on end of file or error.
	#
	# In fact, this method works like `read_char` except it skips whitespace.
	#
	# ~~~
	# var w = new StringReader(" \nab\tc")
	# assert w.read_nonwhitespace == 'a'
	# assert w.read_nonwhitespace == 'b'
	# assert w.read_nonwhitespace == 'c'
	# assert w.read_nonwhitespace == null
	# ~~~
	#
	# `Char::is_whitespace` determines what is a whitespace.
	fun read_nonwhitespace: nullable Char
	do
		var c: nullable Char = null
		while not eof do
			c = read_char
			if c == null or not c.is_whitespace then break
		end
		return c
	end
end
393
# Lazy line-by-line iterator over a `Reader`.
#
# Instances are produced by `Reader::each_line`;
# see that method for details and examples.
class LineIterator
	super Iterator[String]

	# The stream the lines are read from
	var stream: Reader

	# Is there still something to read from `stream`?
	#
	# When the end is reached and `close_on_finish` is set, `stream` is closed.
	redef fun is_ok
	do
		if not stream.eof then return true
		if close_on_finish then stream.close
		return false
	end

	# The current line, read from `stream` on first access then cached.
	redef fun item
	do
		var cached = self.line
		if cached != null then return cached
		var fresh = stream.read_line
		self.line = fresh
		return fresh
	end

	# Cache of the current line (`null` when not read yet)
	private var line: nullable String = null

	# Move to the next line.
	redef fun next
	do
		# Make sure the current line is consumed from the stream...
		if line == null then
			item
		end
		# ...then forget it so that `item` reads the following one
		line = null
	end

	# Close the stream when the stream is at the EOF.
	#
	# Default is false.
	var close_on_finish = false is writable

	# Close `stream` if `close_on_finish` is set.
	redef fun finish
	do
		if not close_on_finish then return
		stream.close
	end
end
440
# `Reader` capable of declaring if readable without blocking
abstract class PollableReader
	super Reader

	# Is there something to read? (without blocking)
	#
	# Abstract: each concrete pollable reader provides its own check.
	fun poll_in: Bool is abstract

end
449
# A `Stream` that can be written to
abstract class Writer
	super Stream

	# Write the bytes of `s`
	fun write_bytes(s: Bytes) is abstract

	# Write the text `s`
	fun write(s: Text) is abstract

	# Write the single byte `value`
	fun write_byte(value: Byte) is abstract

	# Write the single character `c`
	#
	# By default, `c` is converted to a string and handed to `write`.
	fun write_char(c: Char)
	do
		write(c.to_s)
	end

	# Can the stream be used to write
	fun is_writable: Bool is abstract
end
469
# Things that can be efficiently written to a `Writer`
#
# The point of this interface is to allow the instance to be efficiently
# written into a `Writer`.
#
# Ready-to-save documents usually provide this interface.
interface Writable
	# Write itself to a `stream`
	# The specific logic is left to the concrete subclasses
	fun write_to(stream: Writer) is abstract

	# Like `write_to` but return a new String (may be quite large)
	#
	# This functionality is anecdotal, since the point
	# of a streamable object is to be efficiently written to a
	# stream without having to allocate and concatenate strings
	fun write_to_string: String
	do
		# Collect everything in memory, then decode it as a string
		var sink = new StringWriter
		write_to(sink)
		return sink.to_s
	end
end
493
redef class Bytes
	super Writable

	# Write the raw bytes of `self` to `s`
	redef fun write_to(s)
	do
		s.write_bytes(self)
	end

	# No intermediate stream is needed: directly convert with `to_s`
	redef fun write_to_string
	do
		return to_s
	end
end
500
redef class Text
	super Writable

	# Write `self` to `stream` as text
	redef fun write_to(stream)
	do
		stream.write(self)
	end
end
505
# Input streams with a buffered input for efficiency purposes
#
# Concrete subclasses provide `fill_buffer` and `end_reached`,
# and allocate the buffer with `prepare_buffer`.
abstract class BufferedReader
	super Reader

	# Read the next byte of the buffer as a character.
	#
	# Returns `null` if an error was already recorded or at the end of
	# the stream (in which case an `IOError` is stored in `last_error`).
	redef fun read_char
	do
		if last_error != null then return null
		if eof then
			last_error = new IOError("Stream has reached eof")
			return null
		end
		# TODO: Fix when supporting UTF-8
		var c = _buffer[_buffer_pos].to_i.code_point
		_buffer_pos += 1
		return c
	end

	# Read the next byte of the buffer.
	#
	# Returns `-1` if an error was already recorded or at the end of
	# the stream (in which case an `IOError` is stored in `last_error`).
	redef fun read_byte
	do
		if last_error != null then return -1
		if eof then
			last_error = new IOError("Stream has reached eof")
			return -1
		end
		var c = _buffer[_buffer_pos]
		_buffer_pos += 1
		return c.to_i
	end

	# Resets the internal buffer
	fun buffer_reset do
		_buffer_length = 0
		_buffer_pos = 0
	end

	# Peeks up to `n` bytes in the buffer
	#
	# The operation does not consume the buffer
	#
	# If `i <= 0` or the stream is at its end, an empty buffer is returned.
	#
	# ~~~nitish
	# var x = new FileReader.open("File.txt")
	# assert x.peek(5) == x.read(5)
	# ~~~
	fun peek(i: Int): Bytes do
		if eof then return new Bytes.empty
		# A non-positive request has nothing to peek
		# (and must not reach the allocation below with a negative size)
		if i <= 0 then return new Bytes.empty
		var remsp = _buffer_length - _buffer_pos
		if i <= remsp then
			# Everything requested is already buffered
			var bf = new Bytes.with_capacity(i)
			bf.append_ns_from(_buffer, i, _buffer_pos)
			return bf
		end
		# Take what is buffered, then read the missing part from the source
		var bf = new Bytes.with_capacity(i)
		bf.append_ns_from(_buffer, remsp, _buffer_pos)
		_buffer_pos = _buffer_length
		read_intern(i - bf.length, bf)
		remsp = _buffer_length - _buffer_pos
		var full_len = bf.length + remsp
		if full_len > _buffer_capacity then
			var c = _buffer_capacity
			while c < full_len do c = c * 2 + 2
			_buffer_capacity = c
		end
		# Rebuild the buffer as: peeked bytes followed by the unread remainder,
		# so that the peeked bytes are still available to the next reads
		var nns = new CString(_buffer_capacity)
		bf.items.copy_to(nns, bf.length, 0, 0)
		_buffer.copy_to(nns, remsp, _buffer_pos, bf.length)
		_buffer = nns
		_buffer_pos = 0
		_buffer_length = full_len
		return bf
	end

	# Read at most `i` bytes
	#
	# If `i <= 0`, an empty buffer is returned.
	redef fun read_bytes(i)
	do
		# `i <= 0` must not reach `read_intern`:
		# a negative count would move `_buffer_pos` backwards
		if last_error != null or i <= 0 then return new Bytes.empty
		var buf = new Bytes.with_capacity(i)
		read_intern(i, buf)
		return buf
	end

	# Fills `buf` with at most `i` bytes read from `self`
	#
	# Returns the number of bytes appended to `buf`.
	private fun read_intern(i: Int, buf: Bytes): Int do
		if eof then return 0
		var p = _buffer_pos
		var bufsp = _buffer_length - p
		if bufsp >= i then
			# The buffer holds enough bytes to serve the whole request
			_buffer_pos += i
			buf.append_ns_from(_buffer, i, p)
			return i
		end
		# Drain the buffer, then recurse for the missing bytes
		# (the `eof` test of the recursive call refills the buffer)
		_buffer_pos = _buffer_length
		var readln = _buffer_length - p
		buf.append_ns_from(_buffer, readln, p)
		var rd = read_intern(i - readln, buf)
		return rd + readln
	end

	# Read all the remaining bytes until the end of the stream
	redef fun read_all_bytes
	do
		if last_error != null then return new Bytes.empty
		var s = new Bytes.with_capacity(10)
		while not eof do
			var j = _buffer_pos
			var k = _buffer_length
			var rd_sz = k - j
			# Use the current `_buffer` at each turn:
			# a `fill_buffer` implementation may have reallocated it
			s.append_ns_from(_buffer, rd_sz, j)
			_buffer_pos = k
			fill_buffer
		end
		return s
	end

	# Append the next line (terminator `\n` included, if any) to `s`
	#
	# NOTE(review): when the buffer is empty this asserts `end_reached`;
	# callers such as `read_line` test `eof` (which fills the buffer) first.
	redef fun append_line_to(s)
	do
		var lb = new Bytes.with_capacity(10)
		loop
			# First phase: look for a '\n'
			var i = _buffer_pos
			while i < _buffer_length and _buffer[i] != 0xAu8 do
				i += 1
			end

			var eol
			if i < _buffer_length then
				assert _buffer[i] == 0xAu8
				i += 1
				eol = true
			else
				eol = false
			end

			# if there is something to append
			if i > _buffer_pos then
				# Copy from the buffer to the string
				var j = _buffer_pos
				while j < i do
					lb.add(_buffer[j])
					j += 1
				end
				_buffer_pos = i
			else
				assert end_reached
				s.append lb.to_s
				return
			end

			if eol then
				# so \n is found
				s.append lb.to_s
				return
			else
				# so \n is not found
				if end_reached then
					s.append lb.to_s
					return
				end
				fill_buffer
			end
		end
	end

	# Is the end of the stream reached?
	#
	# If the buffer is exhausted, a `fill_buffer` is attempted first.
	redef fun eof
	do
		if _buffer_pos < _buffer_length then return false
		if end_reached then return true
		fill_buffer
		return _buffer_pos >= _buffer_length and end_reached
	end

	# The buffer
	private var buffer: CString = new CString(0)

	# The current position in the buffer
	private var buffer_pos = 0

	# Length of the current buffer (i.e. number of bytes in the buffer)
	private var buffer_length = 0

	# Capacity of the buffer
	private var buffer_capacity = 0

	# Fill the buffer
	protected fun fill_buffer is abstract

	# Has the last fill_buffer reached the end
	protected fun end_reached: Bool is abstract

	# Allocate a `_buffer` for a given `capacity`.
	protected fun prepare_buffer(capacity: Int)
	do
		_buffer = new CString(capacity)
		_buffer_pos = 0 # need to read
		_buffer_length = 0
		_buffer_capacity = capacity
	end
end
701
# A `Stream` that can be written to and read from
#
# Combines the services of both `Reader` and `Writer`.
abstract class Duplex
	super Reader
	super Writer
end
707
# Write to `bytes` in memory
#
# ~~~
# var writer = new BytesWriter
#
# writer.write "Strings "
# writer.write_char '&'
# writer.write_byte 0x20u8
# writer.write_bytes "bytes".to_bytes
#
# assert writer.to_s == "\\x53\\x74\\x72\\x69\\x6E\\x67\\x73\\x20\\x26\\x20\\x62\\x79\\x74\\x65\\x73"
# assert writer.bytes.to_s == "Strings & bytes"
# ~~~
#
# As with any binary data, UTF-8 code points encoded on two bytes or more
# can be constructed byte by byte.
#
# ~~~
# writer = new BytesWriter
#
# # Write just the character first half
# writer.write_byte 0xC2u8
# assert writer.to_s == "\\xC2"
# assert writer.bytes.to_s == "�"
#
# # Complete the character
# writer.write_byte 0xA2u8
# assert writer.to_s == "\\xC2\\xA2"
# assert writer.bytes.to_s == "¢"
# ~~~
class BytesWriter
	super Writer

	# Written memory
	var bytes = new Bytes.empty

	# Escaped hexadecimal representation of `bytes`
	redef fun to_s
	do
		return bytes.chexdigest
	end

	# Append the bytes of `str`, unless the stream is closed
	redef fun write(str)
	do
		if not closed then str.append_to_bytes bytes
	end

	# Append the character `c`, unless the stream is closed
	redef fun write_char(c)
	do
		if not closed then bytes.add_char c
	end

	# Append the byte `value`, unless the stream is closed
	redef fun write_byte(value)
	do
		if not closed then bytes.add value
	end

	# Append all the bytes of `b`, unless the stream is closed
	redef fun write_bytes(b)
	do
		if not closed then bytes.append b
	end

	# Is the stream closed?
	protected var closed = false

	# Mark the stream as closed; further writes are ignored
	redef fun close
	do
		closed = true
	end

	# A `BytesWriter` is writable as long as it is not closed
	redef fun is_writable
	do
		return not closed
	end
end
776
# `Stream` writing to a `String`
#
# This class has the same behavior as `BytesWriter`
# except for `to_s` which decodes `bytes` to a string.
#
# ~~~
# var writer = new StringWriter
#
# writer.write "Strings "
# writer.write_char '&'
# writer.write_byte 0x20u8
# writer.write_bytes "bytes".to_bytes
#
# assert writer.to_s == "Strings & bytes"
# ~~~
class StringWriter
	super BytesWriter

	# The written `bytes` converted to a string
	redef fun to_s
	do
		return bytes.to_s
	end
end
797
# Read from `bytes` in memory
#
# ~~~
# var reader = new BytesReader(b"a…b")
# assert reader.read_char == 'a'
# assert reader.read_byte == 0xE2 # 1st byte of '…'
# assert reader.read_byte == 0x80 # 2nd byte of '…'
# assert reader.read_char == '�' # Reads the last byte as an invalid char
# assert reader.read_all_bytes == b"b"
# ~~~
class BytesReader
	super Reader

	# Source data to read
	var bytes: Bytes

	# The current position in `bytes`
	private var cursor = 0

	# Read the character starting at `cursor` and move past all its bytes
	redef fun read_char
	do
		var pos = cursor
		if pos >= bytes.length then return null

		var items = bytes.items
		var char = items.char_at(pos)
		cursor = pos + items.length_of_char_at(pos)
		return char
	end

	# Read the byte at `cursor` and move one byte forward
	#
	# Returns `-1` when all of `bytes` has been consumed.
	redef fun read_byte
	do
		var pos = cursor
		if pos >= bytes.length then return -1

		cursor = pos + 1
		return bytes[pos].to_i
	end

	# Drop the source data
	redef fun close
	do
		bytes = new Bytes.empty
	end

	# Return everything from `cursor` to the end, and move `cursor` to the end
	redef fun read_all_bytes
	do
		var rest = bytes.slice_from(cursor)
		cursor = bytes.length
		return rest
	end

	# The end is reached once `cursor` is past the last byte
	redef fun eof
	do
		return cursor >= bytes.length
	end
end
847
# `Stream` reading from a `String` source
#
# This class has the same behavior as `BytesReader`
# except for its constructor accepting a `String`.
#
# ~~~
# var reader = new StringReader("a…b")
# assert reader.read_char == 'a'
# assert reader.read_byte == 0xE2 # 1st byte of '…'
# assert reader.read_byte == 0x80 # 2nd byte of '…'
# assert reader.read_char == '�' # Reads the last byte as an invalid char
# assert reader.read_all == "b"
# ~~~
class StringReader
	super BytesReader

	autoinit source

	# Source data to read
	var source: String

	# The bytes to read are those of `source`
	init
	do
		bytes = source.to_bytes
	end

	# Forget `source`, then close as a `BytesReader`
	redef fun close
	do
		source = ""
		super
	end
end