152c294a86969b5fdc5d57607bec8ab738665eae
[nit.git] / lib / core / stream.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # Input and output streams of characters
12 module stream
13
14 import error
15 intrude import bytes
16 import codecs
17
18 in "C" `{
19 #include <unistd.h>
20 #include <string.h>
21 #include <signal.h>
22 `}
23
24 # Any kind of error that could be produced by an operation on Streams
#
# Streams in this module record such errors in `Stream::last_error`.
25 class IOError
26 super Error
27 end
28
# Any kind of stream to read/write/both to or from a source
#
# Holds the state shared by readers and writers: the `codec`, the
# lookahead buffer used while decoding, the write buffer used while
# encoding, and the last error met.
abstract class Stream
	# Codec used to transform raw data to text
	#
	# Note: defaults to UTF-8
	var codec: Codec = utf8_codec is protected writable(set_codec)

	# Lookahead buffer for codecs
	#
	# Since some codecs are multibyte, a lookahead may be required
	# to store the next bytes and consume them only if a valid character
	# is read.
	protected var lookahead: CString is noinit

	# Capacity of the lookahead
	protected var lookahead_capacity = 0

	# Current occupation of the lookahead
	protected var lookahead_length = 0

	# Buffer for writing data to a stream
	protected var write_buffer: CString is noinit

	# Allocate `lookahead` and `write_buffer` for the initial `codec`.
	init do
		var lcap = codec.max_lookahead
		lookahead = new CString(lcap)
		write_buffer = new CString(lcap)
		lookahead_length = 0
		lookahead_capacity = lcap
	end

	# Change the codec for this stream.
	#
	# When `c` requires a larger lookahead than the current capacity,
	# `lookahead` and `write_buffer` are reallocated; the bytes already
	# pending in the lookahead are preserved.
	fun codec=(c: Codec) do
		if c.max_lookahead > lookahead_capacity then
			# Size the new buffers for the incoming codec `c`, not for
			# the codec being replaced: `codec` still designates the old
			# one until `set_codec` is called below.
			var lcap = c.max_lookahead
			var lk = new CString(lcap)
			var llen = lookahead_length
			if llen > 0 then
				lookahead.copy_to(lk, llen, 0, 0)
			end
			lookahead = lk
			lookahead_capacity = lcap
			write_buffer = new CString(lcap)
		end
		set_codec(c)
	end

	# Error produced by the file stream
	#
	#     var ifs = new FileReader.open("donotmakethisfile.binx")
	#     ifs.read_all
	#     ifs.close
	#     assert ifs.last_error != null
	var last_error: nullable IOError = null

	# close the stream
	fun close is abstract

	# Pre-work hook.
	#
	# Used to inform `self` that operations will start.
	# Specific streams can use this to prepare some resources.
	#
	# Is automatically invoked at the beginning of `with` structures.
	#
	# Do nothing by default.
	fun start do end

	# Post-work hook.
	#
	# Used to inform `self` that the operations are over.
	# Specific streams can use this to free some resources.
	#
	# Is automatically invoked at the end of `with` structures.
	#
	# call `close` by default.
	fun finish do close
end
107
108 # A `Stream` that can be read from
109 abstract class Reader
110 super Stream
111
# Read a byte directly from the underlying stream, without
# considering any eventual buffer
#
# Callers in this module treat a negative result as "no byte available".
protected fun raw_read_byte: Int is abstract

# Read at most `max` bytes from the underlying stream into `buf`,
# without considering any eventual buffer
#
# The default implementation pulls bytes one at a time with
# `raw_read_byte` and stops at the first negative result.
#
# Returns how many bytes were read
protected fun raw_read_bytes(buf: CString, max: Int): Int do
	var count = 0
	while count < max do
		var byte = raw_read_byte
		if byte < 0 then break
		buf[count] = byte
		count += 1
	end
	return count
end
130
# Reads a character. Returns `null` on EOF or timeout
#
# Returns unicode replacement character '�' if an
# invalid byte sequence is read.
#
# `read_char` may block if:
#
# * No byte could be read from the current buffer
# * An incomplete char is partially read, and more bytes are
#   required for full decoding.
fun read_char: nullable Char do
	if eof then return null
	var decoder = codec
	var unit = decoder.codet_size
	var buf = lookahead
	var pending = lookahead_length
	# Make sure at least one full code unit sits in the lookahead.
	if pending < unit then
		pending += raw_read_bytes(buf.fast_cstring(pending), unit - pending)
		if pending < unit then
			# Not even one code unit is available: drop what was read.
			lookahead_length = 0
			return 0xFFFD.code_point
		end
	end
	var status = decoder.is_valid_char(buf, unit)
	var limit = decoder.max_lookahead
	# Status 1 is handled as "more bytes needed": keep feeding the
	# lookahead one code unit at a time, up to the codec's limit.
	while status == 1 and pending < limit do
		var got = raw_read_bytes(buf.fast_cstring(pending), unit)
		if got < unit then
			# The source ran dry in the middle of a sequence:
			# discard the leading code unit and report a bad char.
			pending -= unit
			if pending > 0 then
				lookahead.lshift(unit, pending, unit)
			end
			lookahead_length = pending.max(0)
			return 0xFFFD.code_point
		end
		pending += unit
		status = decoder.is_valid_char(buf, pending)
	end
	if status == 0 then
		# A complete character: decode it and remove its bytes.
		var ch = decoder.decode_char(buf)
		var consumed = ch.u8char_len
		pending -= consumed
		if pending > 0 then
			lookahead.lshift(consumed, pending, consumed)
		end
		lookahead_length = pending
		return ch
	end
	if status == 1 or status == 2 then
		# Invalid or still incomplete: skip one code unit and keep the
		# remaining bytes for the next call.
		pending -= unit
		if pending > 0 then
			lookahead.lshift(unit, pending, unit)
		end
		lookahead_length = pending
		return 0xFFFD.code_point
	end
	# Should not happen if the decoder works properly
	var parts = new Array[Object]
	parts.push "Decoder error: could not decode nor recover from byte sequence ["
	for i in [0 .. pending[ do
		parts.push buf[i]
		parts.push ", "
	end
	parts.push "]"
	var err = new IOError(parts.plain_to_s)
	err.cause = last_error
	last_error = err
	return 0xFFFD.code_point
end
200
# Reads a byte. Returns a negative value on error
#
# Bytes pending in the lookahead are served first; the underlying
# stream is only queried when the lookahead is empty.
fun read_byte: Int do
	var pending = lookahead_length
	if pending == 0 then return raw_read_byte
	var buf = lookahead
	var head = buf[0].to_i
	if pending == 1 then
		lookahead_length = 0
	else
		# Move the remaining bytes to the front of the lookahead
		buf.lshift(1, pending - 1, 1)
		lookahead_length -= 1
	end
	return head
end
215
# Reads a String of at most `i` length
#
# At most `i` bytes are read, then decoded with `codec`.
# `i` must not be negative.
fun read(i: Int): String do
	assert i >= 0
	var raw = new CString(i)
	var count = read_bytes_to_cstring(raw, i)
	if count >= 0 then return codec.decode_string(raw, count)
	return ""
end
224
# Reads up to `max` bytes from source
#
# `max` must not be negative.
fun read_bytes(max: Int): Bytes do
	assert max >= 0
	var raw = new CString(max)
	var count = read_bytes_to_cstring(raw, max)
	return new Bytes(raw, count, max)
end
232
# Reads up to `max` bytes from source and stores them in `bytes`
#
# Bytes pending in the lookahead are copied first, then the underlying
# stream is asked for the rest.
#
# Returns how many bytes were stored.
fun read_bytes_to_cstring(bytes: CString, max: Int): Int do
	var pending = lookahead_length
	if pending == 0 then return raw_read_bytes(bytes, max)
	var buf = lookahead
	var taken = max.min(pending)
	buf.copy_to(bytes, taken, 0, 0)
	if taken >= pending then
		lookahead_length = 0
	else
		# Keep the bytes that did not fit at the front of the lookahead
		buf.lshift(taken, pending - taken, taken)
		lookahead_length -= taken
	end
	var fresh = raw_read_bytes(bytes.fast_cstring(taken), max - taken)
	return taken + fresh
end
248
249 # Read a string until the end of the line.
250 #
251 # The line terminator '\n' and '\r\n', if any, is removed in each line.
252 #
253 # ~~~
254 # var txt = "Hello\n\nWorld\n"
255 # var i = new StringReader(txt)
256 # assert i.read_line == "Hello"
257 # assert i.read_line == ""
258 # assert i.read_line == "World"
259 # assert i.eof
260 # ~~~
261 #
262 # Only LINE FEED (`\n`), CARRIAGE RETURN & LINE FEED (`\r\n`), and
263 # the end of file (EOF) are considered to delimit the end of lines.
264 # CARRIAGE RETURN (`\r`) alone is not used for the end of line.
265 #
266 # ~~~
267 # var txt2 = "Hello\r\n\n\rWorld"
268 # var i2 = new StringReader(txt2)
269 # assert i2.read_line == "Hello"
270 # assert i2.read_line == ""
271 # assert i2.read_line == "\rWorld"
272 # assert i2.eof
273 # ~~~
274 #
275 # NOTE: Use `append_line_to` if the line terminator needs to be preserved.
fun read_line: String
do
	# Nothing can be read after an error or at the end of the stream
	if last_error != null or eof then return ""
	var line = new FlatBuffer
	append_line_to(line)
	# `append_line_to` keeps the terminator; strip it here
	return line.to_s.chomp
end
284
# Read all the lines until the eof.
#
# The line terminator '\n' and `\r\n` is removed in each line,
#
# ~~~
# var txt = "Hello\n\nWorld\n"
# var i = new StringReader(txt)
# assert i.read_lines == ["Hello", "", "World"]
# ~~~
#
# This method is more efficient than splitting
# the result of `read_all`.
#
# NOTE: SEE `read_line` for details.
fun read_lines: Array[String]
do
	var lines = new Array[String]
	loop
		if eof then break
		lines.add read_line
	end
	return lines
end
307
308 # Return an iterator that read each line.
309 #
310 # The line terminator '\n' and `\r\n` is removed in each line,
311 # The line are read with `read_line`. See this method for details.
312 #
313 # ~~~
314 # var txt = "Hello\n\nWorld\n"
315 # var i = new StringReader(txt)
316 # assert i.each_line.to_a == ["Hello", "", "World"]
317 # ~~~
318 #
319 # Unlike `read_lines` that read all lines at the call, `each_line` is lazy.
320 # Therefore, the stream should not be closed until the end of the stream.
321 #
322 # ~~~
323 # i = new StringReader(txt)
324 # var el = i.each_line
325 #
326 # assert el.item == "Hello"
327 # el.next
328 # assert el.item == ""
329 # el.next
330 #
331 # i.close
332 #
333 # assert not el.is_ok
334 # # closed before "world" is read
335 # ~~~
fun each_line: LineIterator
do
	# The iterator reads from `self` lazily, one line per step
	return new LineIterator(self)
end
337
# Read all the stream until the eof.
#
# The content of the file is returned as a String,
# decoded with `codec`.
#
# ~~~
# var txt = "Hello\n\nWorld\n"
# var i = new StringReader(txt)
# assert i.read_all == txt
# ~~~
fun read_all: String do
	var raw = read_all_bytes
	var len = raw.length
	if len != 0 then return codec.decode_string(raw.items, len)
	return ""
end
353
# Read all the stream until the eof.
#
# The content of the file is returned verbatim.
# An empty `Bytes` is returned if `last_error` is already set.
fun read_all_bytes: Bytes
do
	var acc = new Bytes.empty
	if last_error != null then return acc
	var chunk_size = 4096
	var chunk = new CString(chunk_size)
	loop
		if eof then break
		var got = read_bytes_to_cstring(chunk, chunk_size)
		acc.append_ns(chunk, got)
	end
	return acc
end
368
369 # Read a string until the end of the line and append it to `s`.
370 #
371 # Unlike `read_line` and other related methods,
372 # the line terminator '\n', if any, is preserved in each line.
373 # Use the method `Text::chomp` to safely remove it.
374 #
375 # ~~~
376 # var txt = "Hello\n\nWorld\n"
377 # var i = new StringReader(txt)
378 # var b = new FlatBuffer
379 # i.append_line_to(b)
380 # assert b == "Hello\n"
381 # i.append_line_to(b)
382 # assert b == "Hello\n\n"
383 # i.append_line_to(b)
384 # assert b == txt
385 # assert i.eof
386 # ~~~
387 #
388 # If `\n` is not present at the end of the result, it means that
389 # a non-eol terminated last line was returned.
390 #
391 # ~~~
392 # var i2 = new StringReader("hello")
393 # assert not i2.eof
394 # var b2 = new FlatBuffer
395 # i2.append_line_to(b2)
396 # assert b2 == "hello"
397 # assert i2.eof
398 # ~~~
399 #
400 # NOTE: The single character LINE FEED (`\n`) delimits the end of lines.
401 # Therefore CARRIAGE RETURN & LINE FEED (`\r\n`) is also recognized.
fun append_line_to(s: Buffer)
do
	if last_error != null then return
	loop
		var ch = read_char
		if ch != null then
			s.chars.push(ch)
			# The terminator is kept in `s`, then the line is over
			if ch == '\n' then return
		else if eof then
			# No char and nothing left to read: unterminated last line
			return
		end
	end
end
415
# Is the end of the stream reached?
#
# Returns `false` if there is something to read, `true` otherwise.
#
# When the lookahead is empty, one byte is fetched from the underlying
# stream into the lookahead to find out.
fun eof: Bool do
	if lookahead_length <= 0 then
		lookahead_length = raw_read_bytes(lookahead, 1)
	end
	return lookahead_length <= 0
end
423
424 # Read the next sequence of non whitespace characters.
425 #
426 # Leading whitespace characters are skipped.
427 # The first whitespace character that follows the result is consumed.
428 #
429 # An empty string is returned if the end of the file or an error is encountered.
430 #
431 # ~~~
432 # var w = new StringReader(" Hello, \n\t World!")
433 # assert w.read_word == "Hello,"
434 # assert w.read_char == '\n'
435 # assert w.read_word == "World!"
436 # assert w.read_word == ""
437 # ~~~
438 #
439 # `Char::is_whitespace` determines what is a whitespace.
fun read_word: String
do
	var word = new FlatBuffer
	# Skip the leading whitespace and get the first char of the word
	var first = read_nonwhitespace
	if first == null then return word.to_s
	word.add(first)
	loop
		if eof then break
		var ch = read_char
		if ch == null then break
		# The whitespace that ends the word is consumed but not kept
		if ch.is_whitespace then break
		word.add(ch)
	end
	return word.to_s
end
456
# Skip whitespace characters (if any) then return the following non-whitespace character.
#
# Returns the code point of the character.
# Returns `null` on end of file or error.
#
# In fact, this method works like `read_char` except it skips whitespace.
#
# ~~~
# var w = new StringReader(" \nab\tc")
# assert w.read_nonwhitespace == 'a'
# assert w.read_nonwhitespace == 'b'
# assert w.read_nonwhitespace == 'c'
# assert w.read_nonwhitespace == null
# ~~~
#
# Trailing whitespace is skipped too: `null` is returned when only
# whitespace remains before the end of the stream.
#
# ~~~
# var w2 = new StringReader("a \n")
# assert w2.read_nonwhitespace == 'a'
# assert w2.read_nonwhitespace == null
# ~~~
#
# `Char::is_whitespace` determines what is a whitespace.
fun read_nonwhitespace: nullable Char
do
	while not eof do
		var c = read_char
		if c == null then return null
		if not c.is_whitespace then return c
	end
	# Only whitespace (or nothing) was left: never hand a whitespace
	# character back to the caller.
	return null
end
482 end
483
# Iterator returned by `Reader::each_line`.
# See the aforementioned method for details.
class LineIterator
	super Iterator[String]

	# The original stream
	var stream: Reader

	# The last line read (cache)
	private var line: nullable String = null

	# Close the stream when the stream is at the EOF.
	#
	# Default is false.
	var close_on_finish = false is writable

	# Is there still something to read from `stream`?
	#
	# Once the end is reached, `stream` is closed if `close_on_finish`.
	redef fun is_ok
	do
		if not stream.eof then return true
		if close_on_finish then stream.close
		return false
	end

	# The current line, read from `stream` on first access then cached.
	redef fun item
	do
		var current = self.line
		if current == null then current = stream.read_line
		self.line = current
		return current
	end

	redef fun next
	do
		# force the read
		if line == null then item
		# drop the line
		line = null
	end

	redef fun finish
	do
		if close_on_finish then stream.close
	end
end
530
531 # `Reader` capable of declaring if readable without blocking
532 abstract class PollableReader
533 super Reader
534
535 # Is there something to read? (without blocking)
#
# Abstract: each concrete subclass provides its own implementation.
536 fun poll_in: Bool is abstract
537
538 end
539
# A `Stream` that can be written to
abstract class Writer
	super Stream

	# Write bytes from `s`
	fun write_bytes(s: Bytes)
	do
		write_bytes_from_cstring(s.items, s.length)
	end

	# Write `len` bytes from `ns`
	fun write_bytes_from_cstring(ns: CString, len: Int) is abstract

	# Write a string
	fun write(s: Text) is abstract

	# Write a single byte
	fun write_byte(value: Int) is abstract

	# Write a single char
	#
	# `c` is encoded into `write_buffer` with `codec`, then the
	# resulting bytes are written with `write_bytes_from_cstring`.
	fun write_char(c: Char)
	do
		var encoded = write_buffer
		var size = codec.add_char_to(c, encoded)
		write_bytes_from_cstring(encoded, size)
	end

	# Can the stream be used to write
	fun is_writable: Bool is abstract
end
565
# Things that can be efficiently written to a `Writer`
#
# The point of this interface is to allow the instance to be efficiently
# written into a `Writer`.
#
# Ready-to-save documents usually provide this interface.
interface Writable
	# Write itself to a `stream`
	# The specific logic is left to the concrete subclasses
	fun write_to(stream: Writer) is abstract

	# Like `write_to` but return a new String (may be quite large).
	#
	# This functionality is anecdotal, since the point
	# of a streamable object is to be efficiently written to a
	# stream without having to allocate and concatenate strings.
	fun write_to_string: String
	do
		var sink = new StringWriter
		write_to(sink)
		return sink.to_s
	end

	# Like `write_to` but return a new Bytes (may be quite large)
	#
	# This functionality is anecdotal, since the point
	# of a streamable object is to be efficiently written to a
	# stream without having to allocate and concatenate buffers.
	#
	# Nevertheless, you might need this method if you want to know
	# the byte size of a writable object.
	fun write_to_bytes: Bytes
	do
		var sink = new BytesWriter
		write_to(sink)
		return sink.bytes
	end
end
604
redef class Bytes
	super Writable

	# Write the content of `self` to `s` with `Writer::write_bytes`.
	redef fun write_to(s)
	do
		s.write_bytes(self)
	end

	# Return `to_s` directly instead of going through a `StringWriter`.
	redef fun write_to_string
	do
		return to_s
	end
end
611
redef class Text
	super Writable

	# Write `self` to `stream` with `Writer::write`.
	redef fun write_to(stream)
	do
		stream.write(self)
	end
end
616
617 # A `Stream` that can be written to and read from
#
# Combines `Reader` and `Writer`; it declares no member of its own.
618 abstract class Duplex
619 super Reader
620 super Writer
621 end
622
623 # Write to `bytes` in memory
624 #
625 # ~~~
626 # var writer = new BytesWriter
627 #
628 # writer.write "Strings "
629 # writer.write_char '&'
630 # writer.write_byte 0x20
631 # writer.write_bytes "bytes".to_bytes
632 #
633 # assert writer.to_s == "\\x53\\x74\\x72\\x69\\x6E\\x67\\x73\\x20\\x26\\x20\\x62\\x79\\x74\\x65\\x73"
634 # assert writer.bytes.to_s == "Strings & bytes"
635 # ~~~
636 #
637 # As with any binary data, UTF-8 code points encoded on two bytes or more
638 # can be constructed byte by byte.
639 #
640 # ~~~
641 # writer = new BytesWriter
642 #
643 # # Write just the character first half
644 # writer.write_byte 0xC2
645 # assert writer.to_s == "\\xC2"
646 # assert writer.bytes.to_s == "�"
647 #
648 # # Complete the character
649 # writer.write_byte 0xA2
650 # assert writer.to_s == "\\xC2\\xA2"
651 # assert writer.bytes.to_s == "¢"
652 # ~~~
class BytesWriter
	super Writer

	# Written memory
	var bytes = new Bytes.empty

	# Is the stream closed?
	protected var closed = false

	# The hexadecimal digest of `bytes` (see `Bytes::chexdigest`)
	redef fun to_s
	do
		return bytes.chexdigest
	end

	# Append `str` to `bytes`; ignored once the writer is closed.
	redef fun write(str)
	do
		if not closed then str.append_to_bytes(bytes)
	end

	# Append `c` to `bytes`; ignored once the writer is closed.
	redef fun write_char(c)
	do
		if not closed then bytes.add_char(c)
	end

	# Append the byte `value` to `bytes`; ignored once the writer is closed.
	redef fun write_byte(value)
	do
		if not closed then bytes.add(value)
	end

	# Append `len` bytes of `ns` to `bytes`; ignored once the writer is closed.
	redef fun write_bytes_from_cstring(ns, len)
	do
		if not closed then bytes.append_ns(ns, len)
	end

	# Mark the writer as closed; `bytes` is left untouched.
	redef fun close
	do
		closed = true
	end

	redef fun is_writable
	do
		return not closed
	end
end
690
691 # `Stream` writing to a `String`
692 #
693 # This class has the same behavior as `BytesWriter`
694 # except for `to_s` which decodes `bytes` to a string.
695 #
696 # ~~~
697 # var writer = new StringWriter
698 #
699 # writer.write "Strings "
700 # writer.write_char '&'
701 # writer.write_byte 0x20
702 # writer.write_bytes "bytes".to_bytes
703 #
704 # assert writer.to_s == "Strings & bytes"
705 # ~~~
class StringWriter
	super BytesWriter

	# The written content, obtained with `Bytes::to_s`
	redef fun to_s
	do
		return bytes.to_s
	end
end
711
712 # Read from `bytes` in memory
713 #
714 # ~~~
715 # var reader = new BytesReader(b"a…b")
716 # assert reader.read_char == 'a'
717 # assert reader.read_byte == 0xE2 # 1st byte of '…'
718 # assert reader.read_byte == 0x80 # 2nd byte of '…'
719 # assert reader.read_char == '�' # Reads the last byte as an invalid char
720 # assert reader.read_all_bytes == b"b"
721 # ~~~
class BytesReader
	super Reader

	# Source data to read
	var bytes: Bytes

	# The current position in `bytes`
	private var cursor = 0

	# Next byte of `bytes`, or `-1` once every byte was consumed.
	redef fun raw_read_byte
	do
		if cursor >= bytes.length then return -1

		var c = bytes[cursor]
		cursor += 1
		return c.to_i
	end

	# Drop the source data and any byte still pending in the lookahead,
	# so that a closed reader is always at `eof`.
	redef fun close
	do
		bytes = new Bytes.empty
		lookahead_length = 0
	end

	# Return all the remaining bytes.
	#
	# Bytes that `read_char` already moved into the lookahead (e.g. after
	# recovering from an invalid sequence) are returned first, then the
	# unread part of `bytes`.
	redef fun read_all_bytes
	do
		var rest = bytes.slice_from(cursor)
		cursor = bytes.length
		var llen = lookahead_length
		if llen == 0 then return rest
		var res = new Bytes.empty
		res.append_ns(lookahead, llen)
		lookahead_length = 0
		res.append_ns(rest.items, rest.length)
		return res
	end

	# Copy at most `max` bytes of the unread part of `bytes` into `ns`.
	#
	# Returns how many bytes were copied.
	redef fun raw_read_bytes(ns, max) do
		if cursor >= bytes.length then return 0

		var copy = max.min(bytes.length - cursor)
		bytes.items.copy_to(ns, copy, cursor, 0)
		cursor += copy
		return copy
	end

	# Is everything consumed?
	#
	# Bytes waiting in the lookahead still count as readable content,
	# like in the default `Reader::eof`.
	redef fun eof do return lookahead_length <= 0 and cursor >= bytes.length
end
760
761 # `Stream` reading from a `String` source
762 #
763 # This class has the same behavior as `BytesReader`
764 # except for its constructor accepting a `String`.
765 #
766 # ~~~
767 # var reader = new StringReader("a…b")
768 # assert reader.read_char == 'a'
769 # assert reader.read_byte == 0xE2 # 1st byte of '…'
770 # assert reader.read_byte == 0x80 # 2nd byte of '…'
771 # assert reader.read_char == '�' # Reads the last byte as an invalid char
772 # assert reader.read_all == "b"
773 # ~~~
class StringReader
	super BytesReader

	autoinit source

	# Source data to read
	var source: String

	# Feed the inherited byte reader with the bytes of `source`.
	init do
		bytes = source.to_bytes
	end

	# Forget `source`, then close like a `BytesReader`.
	redef fun close
	do
		source = ""
		super
	end
end