lib/core: fix Reader::read
[nit.git] / lib / core / stream.nit
index a54c7e8..9c73b1d 100644 (file)
@@ -11,7 +11,6 @@
 # Input and output streams of characters
 module stream
 
-intrude import text::ropes
 import error
 intrude import bytes
 import codecs
@@ -29,6 +28,51 @@ end
 
 # Any kind of stream to read/write/both to or from a source
 abstract class Stream
+       # Codec used to transform raw data to text
+       #
+       # Note: defaults to UTF-8
+       var codec: Codec = utf8_codec is protected writable(set_codec)
+
+       # Lookahead buffer for codecs
+       #
+       # Since some codecs are multibyte, a lookahead may be required
+       # to store the next bytes and consume them only if a valid character
+       # is read.
+       protected var lookahead: CString is noinit
+
+       # Capacity of the lookahead
+       protected var lookahead_capacity = 0
+
+       # Current occupation of the lookahead
+       protected var lookahead_length = 0
+
+       # Buffer for writing data to a stream
+       protected var write_buffer: CString is noinit
+
+       # Allocate the lookahead and the write buffer, both sized after
+       # the `max_lookahead` of the current `codec`.
+       init do
+               var capacity = codec.max_lookahead
+               lookahead_capacity = capacity
+               lookahead_length = 0
+               lookahead = new CString(capacity)
+               write_buffer = new CString(capacity)
+       end
+
+       # Change the codec for this stream.
+       #
+       # If `c` needs a larger lookahead than the one currently allocated,
+       # the lookahead is reallocated (its pending bytes are preserved)
+       # and a new write buffer of the same capacity is allocated
+       # before the codec is replaced.
+       fun codec=(c: Codec) do
+               # Size the buffers for the *new* codec `c`: `codec` still designates
+               # the codec being replaced, whose lookahead may be smaller.
+               var lcap = c.max_lookahead
+               if lcap > lookahead_capacity then
+                       var lk = new CString(lcap)
+                       var llen = lookahead_length
+                       if llen > 0 then
+                               lookahead.copy_to(lk, llen, 0, 0)
+                       end
+                       lookahead = lk
+                       lookahead_capacity = lcap
+                       write_buffer = new CString(lcap)
+               end
+               set_codec(c)
+       end
+
        # Error produced by the file stream
        #
        #     var ifs = new FileReader.open("donotmakethisfile.binx")
@@ -55,7 +99,7 @@ abstract class Stream
        # Used to inform `self` that the operations are over.
        # Specific streams can use this to free some resources.
        #
-       # Is automatically invoked at the end of `woth` structures.
+       # Is automatically invoked at the end of `with` structures.
        #
        # call `close` by default.
        fun finish do close
@@ -65,32 +109,74 @@ end
 abstract class Reader
        super Stream
 
-       # Decoder used to transform input bytes to UTF-8
-       var decoder: Codec = utf8_codec is writable
+       # Read a byte directly from the underlying stream, without
+       # considering any eventual buffer
+       #
+       # A negative return value makes `raw_read_bytes` stop reading.
+       protected fun raw_read_byte: Int is abstract
+
+       # Read at most `max` bytes from the underlying stream into `buf`,
+       # without considering any eventual buffer
+       #
+       # Stops early as soon as `raw_read_byte` yields a negative value.
+       #
+       # Returns how many bytes were read
+       protected fun raw_read_bytes(buf: CString, max: Int): Int do
+               var count = 0
+               while count < max do
+                       var byte = raw_read_byte
+                       if byte < 0 then return count
+                       buf[count] = byte.to_b
+                       count += 1
+               end
+               return count
+       end
 
        # Reads a character. Returns `null` on EOF or timeout
        fun read_char: nullable Char is abstract
 
-       # Reads a byte. Returns `null` on EOF or timeout
-       fun read_byte: nullable Byte is abstract
+       # Reads a byte. Returns a negative value on error
+       #
+       # Bytes pending in the lookahead are served first; the underlying
+       # stream is only queried when the lookahead is empty.
+       fun read_byte: Int do
+               var pending = lookahead_length
+               if pending == 0 then return raw_read_byte
+               var la = lookahead
+               var first = la[0].to_i
+               # Unless it was the only pending byte, shift the remaining ones left by one
+               if pending != 1 then la.lshift(1, pending - 1, 1)
+               lookahead_length = pending - 1
+               return first
+       end
 
        # Reads a String of at most `i` length
-       fun read(i: Int): String do return read_bytes(i).to_s
+       fun read(i: Int): String do
+               assert i >= 0
+               # Raw storage for at most `i` bytes
+               var raw = new CString(i)
+               var count = read_bytes_to_cstring(raw, i)
+               # Decode only the bytes that were actually read
+               if count >= 0 then return codec.decode_string(raw, count)
+               return ""
+       end
 
-       # Read at most i bytes
-       fun read_bytes(i: Int): Bytes
-       do
-               if last_error != null then return new Bytes.empty
-               var s = new NativeString(i)
-               var buf = new Bytes(s, 0, 0)
-               while i > 0 and not eof do
-                       var c = read_byte
-                       if c != null then
-                               buf.add c
-                               i -= 1
-                       end
+       # Reads at most `max` bytes from the source and returns them as `Bytes`
+       #
+       # `max` must not be negative.
+       fun read_bytes(max: Int): Bytes do
+               assert max >= 0
+               var storage = new CString(max)
+               var count = read_bytes_to_cstring(storage, max)
+               var res = new Bytes(storage, count, max)
+               return res
+       end
+
+       # Reads up to `max` bytes from source and stores them in `bytes`
+       #
+       # Bytes pending in the lookahead are consumed first, then the
+       # remainder is read from the underlying stream.
+       #
+       # Returns how many bytes were stored in `bytes`.
+       fun read_bytes_to_cstring(bytes: CString, max: Int): Int do
+               var llen = lookahead_length
+               if llen == 0 then return raw_read_bytes(bytes, max)
+               var rd = max.min(llen)
+               var lk = lookahead
+               lk.copy_to(bytes, rd, 0, 0)
+               if rd < llen then
+                       lk.lshift(rd, llen - rd, rd)
+                       lookahead_length -= rd
+               else
+                       lookahead_length = 0
                 end
-               return buf
+               # Store the remainder *after* the `rd` bytes taken from the lookahead:
+               # `raw_read_bytes` writes from index 0 of the buffer it is given,
+               # so passing `bytes` itself would overwrite them.
+               return rd + raw_read_bytes(bytes.fast_cstring(rd), max - rd)
         end
 
        # Read a string until the end of the line.
@@ -195,29 +281,7 @@ abstract class Reader
                var s = read_all_bytes
                var slen = s.length
                if slen == 0 then return ""
-               var rets = ""
-               var pos = 0
-               var str = s.items.clean_utf8(slen)
-               slen = str.byte_length
-               var sits = str.items
-               var remsp = slen
-               while pos < slen do
-                       # The 129 size was decided more or less arbitrarily
-                       # It will require some more benchmarking to compute
-                       # if this is the best size or not
-                       var chunksz = 129
-                       if chunksz > remsp then
-                               rets += new FlatString.with_infos(sits, remsp, pos)
-                               break
-                       end
-                       var st = sits.find_beginning_of_char_at(pos + chunksz - 1)
-                       var byte_length = st - pos
-                       rets += new FlatString.with_infos(sits, byte_length, pos)
-                       pos = st
-                       remsp -= byte_length
-               end
-               if rets isa Concat then return rets.balance
-               return rets
+               return codec.decode_string(s.items, s.length)
        end
 
        # Read all the stream until the eof.
@@ -227,9 +291,10 @@ abstract class Reader
        do
                if last_error != null then return new Bytes.empty
                var s = new Bytes.empty
+               var buf = new CString(4096)
                while not eof do
-                       var c = read_byte
-                       if c != null then s.add(c)
+                       var rd = read_bytes_to_cstring(buf, 4096)
+                       s.append_ns(buf, rd)
                end
                return s
        end
@@ -405,20 +470,23 @@ end
 abstract class Writer
        super Stream
 
-       # The coder from a nit UTF-8 String to the output file
-       var coder: Codec = utf8_codec is writable
+       # Write the content of `s`
+       #
+       # Delegates to `write_bytes_from_cstring` with the items and length of `s`.
+       fun write_bytes(s: Bytes)
+       do
+               write_bytes_from_cstring(s.items, s.length)
+       end
 
-       # Writes bytes from `s`
-       fun write_bytes(s: Bytes) is abstract
+       # Write `len` bytes from `ns`
+       fun write_bytes_from_cstring(ns: CString, len: Int) is abstract
 
-       # write a string
+       # Write a string
        fun write(s: Text) is abstract
 
        # Write a single byte
        fun write_byte(value: Byte) is abstract
 
-       # Writes a single char
-       fun write_char(c: Char) do write(c.to_s)
+       # Write a single char
+       #
+       # `c` is encoded by `codec` into `write_buffer`, then the
+       # encoded bytes are written.
+       fun write_char(c: Char) do
+               var buf = write_buffer
+               var written = codec.add_char_to(c, buf)
+               write_bytes_from_cstring(buf, written)
+       end
 
        # Can the stream be used to write
        fun is_writable: Bool is abstract
@@ -478,14 +546,14 @@ abstract class BufferedReader
 
        redef fun read_byte
        do
-               if last_error != null then return null
+               if last_error != null then return -1 # a recorded error yields a negative value
                if eof then
                        last_error = new IOError("Stream has reached eof")
-                       return null
+                       return -1 # reading at eof records an `IOError` in `last_error`
                end
                var c = _buffer[_buffer_pos]
                _buffer_pos += 1
-               return c
+               return c.to_i # converted to match the `Int` return type of `Reader::read_byte`
        end
 
        # Resets the internal buffer
@@ -521,7 +589,7 @@ abstract class BufferedReader
                        while c < full_len do c = c * 2 + 2
                        _buffer_capacity = c
                end
-               var nns = new NativeString(_buffer_capacity)
+               var nns = new CString(_buffer_capacity)
                bf.items.copy_to(nns, bf.length, 0, 0)
                _buffer.copy_to(nns, remsp, _buffer_pos, bf.length)
                _buffer = nns
@@ -530,12 +598,11 @@ abstract class BufferedReader
                return bf
        end
 
-       redef fun read_bytes(i)
+       redef fun read_bytes_to_cstring(buf, i)
        do
-               if last_error != null then return new Bytes.empty
-               var buf = new Bytes.with_capacity(i)
-               read_intern(i, buf)
-               return buf
+               if last_error != null then return 0 # nothing is read once an error is recorded
+               var bbf = new Bytes(buf, 0, i) # presumably wraps `buf` (length 0, capacity `i`) - TODO confirm
+               return read_intern(i, bbf) # the result of `read_intern` is returned as is
        end
 
        # Fills `buf` with at most `i` bytes read from `self`
@@ -629,7 +696,7 @@ abstract class BufferedReader
        end
 
        # The buffer
-       private var buffer: NativeString = new NativeString(0)
+       private var buffer: CString = new CString(0)
 
        # The current position in the buffer
        private var buffer_pos = 0
@@ -649,7 +716,7 @@ abstract class BufferedReader
        # Allocate a `_buffer` for a given `capacity`.
        protected fun prepare_buffer(capacity: Int)
        do
-               _buffer = new NativeString(capacity)
+               _buffer = new CString(capacity)
                _buffer_pos = 0 # need to read
                _buffer_length = 0
                _buffer_capacity = capacity
@@ -662,75 +729,180 @@ abstract class Duplex
        super Writer
 end
 
-# `Stream` that can be used to write to a `String`
+# Write to `bytes` in memory
 #
-# Mainly used for compatibility with Writer type and tests.
-class StringWriter
+# ~~~
+# var writer = new BytesWriter
+#
+# writer.write "Strings "
+# writer.write_char '&'
+# writer.write_byte 0x20u8
+# writer.write_bytes "bytes".to_bytes
+#
+# assert writer.to_s == "\\x53\\x74\\x72\\x69\\x6E\\x67\\x73\\x20\\x26\\x20\\x62\\x79\\x74\\x65\\x73"
+# assert writer.bytes.to_s == "Strings & bytes"
+# ~~~
+#
+# As with any binary data, UTF-8 code points encoded on two bytes or more
+# can be constructed byte by byte.
+#
+# ~~~
+# writer = new BytesWriter
+#
+# # Write just the character first half
+# writer.write_byte 0xC2u8
+# assert writer.to_s == "\\xC2"
+# assert writer.bytes.to_s == "�"
+#
+# # Complete the character
+# writer.write_byte 0xA2u8
+# assert writer.to_s == "\\xC2\\xA2"
+# assert writer.bytes.to_s == "¢"
+# ~~~
+class BytesWriter
        super Writer
 
-       private var content = new Array[String]
-       redef fun to_s do return content.plain_to_s
-       redef fun is_writable do return not closed
+       # Written memory
+       var bytes = new Bytes.empty
 
-       redef fun write_bytes(b) do
-               content.add(b.to_s)
-       end
+       redef fun to_s do return bytes.chexdigest # result of `chexdigest` on the written bytes
 
        redef fun write(str)
        do
-               assert not closed
-               content.add(str.to_s)
+               if closed then return # writing to a closed stream is silently ignored
+               str.append_to_bytes bytes
+       end
+
+       redef fun write_char(c)
+       do
+               if closed then return # ignored once closed
+               bytes.add_char c
+       end
+
+       redef fun write_byte(value)
+       do
+               if closed then return # ignored once closed
+               bytes.add value
+       end
+
+       redef fun write_bytes_from_cstring(ns, len) do
+               if closed then return # ignored once closed
+               bytes.append_ns(ns, len)
        end
 
        # Is the stream closed?
        protected var closed = false
 
        redef fun close do closed = true
+       redef fun is_writable do return not closed # writable until `close` is called
 end
 
-# `Stream` used to read from a `String`
+# `Stream` writing to a `String`
 #
-# Mainly used for compatibility with Reader type and tests.
-class StringReader
+# This class has the same behavior as `BytesWriter`
+# except for `to_s` which decodes `bytes` to a string.
+#
+# ~~~
+# var writer = new StringWriter
+#
+# writer.write "Strings "
+# writer.write_char '&'
+# writer.write_byte 0x20u8
+# writer.write_bytes "bytes".to_bytes
+#
+# assert writer.to_s == "Strings & bytes"
+# ~~~
+class StringWriter
+       super BytesWriter
+
+       # The written `bytes`, converted to a string
+       redef fun to_s
+       do
+               return bytes.to_s
+       end
+end
+
+# Read from `bytes` in memory
+#
+# ~~~
+# var reader = new BytesReader(b"a…b")
+# assert reader.read_char == 'a'
+# assert reader.read_byte == 0xE2 # 1st byte of '…'
+# assert reader.read_byte == 0x80 # 2nd byte of '…'
+# assert reader.read_char == '�' # Reads the last byte as an invalid char
+# assert reader.read_all_bytes == b"b"
+# ~~~
+class BytesReader
        super Reader
 
-       # The string to read from.
-       var source: String
+       # Source data to read
+       var bytes: Bytes
 
-       # The current position in the string (bytewise).
-       private var cursor: Int = 0
+       # The current position in `bytes`
+       private var cursor = 0
 
-       redef fun read_char do
-               if cursor < source.length then
-                       # Fix when supporting UTF-8
-                       var c = source[cursor]
-                       cursor += 1
-                       return c
-               else
-                       return null
-               end
+       redef fun read_char
+       do
+               if cursor >= bytes.length then return null # nothing left to read
+
+               var len = bytes.items.length_of_char_at(cursor)
+               var char = bytes.items.char_at(cursor)
+               cursor += len # advance by the length reported by `length_of_char_at`
+               return char
        end
 
-       redef fun read_byte do
-               if cursor < source.length then
-                       var c = source.bytes[cursor]
-                       cursor += 1
-                       return c
-               else
-                       return null
-               end
+       redef fun read_byte
+       do
+               if cursor >= bytes.length then return -1 # negative value once everything is read
+
+               var c = bytes[cursor]
+               cursor += 1
+               return c.to_i
        end
 
-       redef fun close do
-               source = ""
+       redef fun close do bytes = new Bytes.empty # `cursor` is left untouched
+
+       redef fun read_all_bytes
+       do
+               var res = bytes.slice_from(cursor)
+               cursor = bytes.length # everything is now consumed
+               return res
        end
 
-       redef fun read_all_bytes do
-               var nslen = source.length - cursor
-               var nns = new NativeString(nslen)
-               source.copy_to_native(nns, nslen, cursor, 0)
-               return new Bytes(nns, nslen, nslen)
+       redef fun raw_read_bytes(ns, max) do
+               if cursor >= bytes.length then return 0
+
+               var copy = max.min(bytes.length - cursor) # never more than what remains in `bytes`
+               bytes.items.copy_to(ns, copy, cursor, 0)
+               cursor += copy
+               return copy
        end
 
-       redef fun eof do return cursor >= source.byte_length
+       redef fun eof do return cursor >= bytes.length
+end
+
+# `Stream` reading from a `String` source
+#
+# This class has the same behavior as `BytesReader`
+# except for its constructor accepting a `String`.
+#
+# ~~~
+# var reader = new StringReader("a…b")
+# assert reader.read_char == 'a'
+# assert reader.read_byte == 0xE2 # 1st byte of '…'
+# assert reader.read_byte == 0x80 # 2nd byte of '…'
+# assert reader.read_char == '�' # Reads the last byte as an invalid char
+# assert reader.read_all == "b"
+# ~~~
+class StringReader
+       super BytesReader
+
+       autoinit source
+
+       # String whose bytes are read
+       var source: String
+
+       # Expose the bytes of `source` to the services inherited from `BytesReader`
+       init
+       do
+               bytes = source.to_bytes
+       end
+
+       # Drop `source`, then run the inherited `close`
+       redef fun close
+       do
+               source = ""
+               super
+       end
+end