X-Git-Url: http://nitlanguage.org

diff --git a/lib/core/stream.nit b/lib/core/stream.nit
index 799d130..b0486ef 100644
--- a/lib/core/stream.nit
+++ b/lib/core/stream.nit
@@ -11,7 +11,6 @@
 # Input and output streams of characters
 module stream
 
-intrude import text::ropes
 import error
 intrude import bytes
 import codecs
@@ -29,6 +28,51 @@ end
 
 # Any kind of stream to read/write/both to or from a source
 abstract class Stream
+	# Codec used to transform raw data to text
+	#
+	# Note: defaults to UTF-8
+	var codec: Codec = utf8_codec is protected writable(set_codec)
+
+	# Lookahead buffer for codecs
+	#
+	# Since some codecs are multibyte, a lookahead may be required
+	# to store the next bytes and consume them only if a valid character
+	# is read.
+	protected var lookahead: CString is noinit
+
+	# Capacity of the lookahead
+	protected var lookahead_capacity = 0
+
+	# Current occupation of the lookahead
+	protected var lookahead_length = 0
+
+	# Buffer for writing data to a stream
+	protected var write_buffer: CString is noinit
+
+	init do
+		var lcap = codec.max_lookahead
+		lookahead = new CString(lcap)
+		write_buffer = new CString(lcap)
+		lookahead_length = 0
+		lookahead_capacity = lcap
+	end
+
+	# Change the codec for this stream.
+	fun codec=(c: Codec) do
+		if c.max_lookahead > lookahead_capacity then
+			var lcap = codec.max_lookahead
+			var lk = new CString(lcap)
+			var llen = lookahead_length
+			if llen > 0 then
+				lookahead.copy_to(lk, llen, 0, 0)
+			end
+			lookahead = lk
+			lookahead_capacity = lcap
+			write_buffer = new CString(lcap)
+		end
+		set_codec(c)
+	end
+
 	# Error produced by the file stream
 	#
 	#     var ifs = new FileReader.open("donotmakethisfile.binx")
@@ -39,38 +83,167 @@ abstract class Stream
 
 	# close the stream
 	fun close is abstract
+
+	# Pre-work hook.
+	#
+	# Used to inform `self` that operations will start.
+	# Specific streams can use this to prepare some resources.
+	#
+	# Is automatically invoked at the beginning of `with` structures.
+	#
+	# Do nothing by default.
+	fun start do end
+
+	# Post-work hook.
+	#
+	# Used to inform `self` that the operations are over.
+	# Specific streams can use this to free some resources.
+	#
+	# Is automatically invoked at the end of `with` structures.
+	#
+	# call `close` by default.
+	fun finish do close
 end
 
 # A `Stream` that can be read from
 abstract class Reader
 	super Stream
 
-	# Decoder used to transform input bytes to UTF-8
-	var decoder: Decoder = utf8_decoder is writable
+	# Read a byte directly from the underlying stream, without
+	# considering any eventual buffer
+	protected fun raw_read_byte: Int is abstract
+
+	# Read at most `max` bytes from the underlying stream into `buf`,
+	# without considering any eventual buffer
+	#
+	# Returns how many bytes were read
+	protected fun raw_read_bytes(buf: CString, max: Int): Int do
+		var rd = 0
+		for i in [0 .. max[ do
+			var b = raw_read_byte
+			if b < 0 then break
+			buf[i] = b
+			rd += 1
+		end
+		return rd
+	end
 
 	# Reads a character. Returns `null` on EOF or timeout
-	fun read_char: nullable Char is abstract
+	#
+	# Returns unicode replacement character 'ï¿½' if an
+	# invalid byte sequence is read.
+	#
+	# `read_char` may block if:
+	#
+	# * No byte could be read from the current buffer
+	# * An incomplete char is partially read, and more bytes are
+	#   required for full decoding.
+	fun read_char: nullable Char do
+		if eof then return null
+		var cod = codec
+		var codet_sz = cod.codet_size
+		var lk = lookahead
+		var llen = lookahead_length
+		if llen < codet_sz then
+			llen += raw_read_bytes(lk.fast_cstring(llen), codet_sz - llen)
+		end
+		if llen < codet_sz then
+			lookahead_length = 0
+			return 0xFFFD.code_point
+		end
+		var ret = cod.is_valid_char(lk, codet_sz)
+		var max_llen = cod.max_lookahead
+		while ret == 1 and llen < max_llen do
+			var rd = raw_read_bytes(lk.fast_cstring(llen), codet_sz)
+			if rd < codet_sz then
+				llen -= codet_sz
+				if llen > 0 then
+					lookahead.lshift(codet_sz, llen, codet_sz)
+				end
+				lookahead_length = llen.max(0)
+				return 0xFFFD.code_point
+			end
+			llen += codet_sz
+			ret = cod.is_valid_char(lk, llen)
+		end
+		if ret == 0 then
+			var c = cod.decode_char(lk)
+			var clen = c.u8char_len
+			llen -= clen
+			if llen > 0 then
+				lookahead.lshift(clen, llen, clen)
+			end
+			lookahead_length = llen
+			return c
+		end
+		if ret == 2 or ret == 1 then
+			llen -= codet_sz
+			if llen > 0 then
+				lookahead.lshift(codet_sz, llen, codet_sz)
+			end
+			lookahead_length = llen
+			return 0xFFFD.code_point
+		end
+		# Should not happen if the decoder works properly
+		var arr = new Array[Object]
+		arr.push "Decoder error: could not decode nor recover from byte sequence ["
+		for i in [0 .. llen[ do
+			arr.push lk[i]
+			arr.push ", "
+		end
+		arr.push "]"
+		var err = new IOError(arr.plain_to_s)
+		err.cause = last_error
+		last_error = err
+		return 0xFFFD.code_point
+	end
 
-	# Reads a byte. Returns `null` on EOF or timeout
-	fun read_byte: nullable Byte is abstract
+	# Reads a byte. Returns a negative value on error
+	fun read_byte: Int do
+		var llen = lookahead_length
+		if llen == 0 then return raw_read_byte
+		var lk = lookahead
+		var b = lk[0].to_i
+		if llen == 1 then
+			lookahead_length = 0
+		else
+			lk.lshift(1, llen - 1, 1)
+			lookahead_length -= 1
+		end
+		return b
+	end
 
 	# Reads a String of at most `i` length
-	fun read(i: Int): String do return read_bytes(i).to_s
+	fun read(i: Int): String do
+		assert i >= 0
+		var cs = new CString(i)
+		var rd = read_bytes_to_cstring(cs, i)
+		if rd < 0 then return ""
+		return codec.decode_string(cs, rd)
+	end
 
-	# Read at most i bytes
-	fun read_bytes(i: Int): Bytes
-	do
-		if last_error != null then return new Bytes.empty
-		var s = new NativeString(i)
-		var buf = new Bytes(s, 0, 0)
-		while i > 0 and not eof do
-			var c = read_byte
-			if c != null then
-				buf.add c
-				i -= 1
-			end
+	# Reads up to `max` bytes from source
+	fun read_bytes(max: Int): Bytes do
+		assert max >= 0
+		var cs = new CString(max)
+		var rd = read_bytes_to_cstring(cs, max)
+		return new Bytes(cs, rd, max)
+	end
+
+	# Reads up to `max` bytes from source and stores them in `bytes`
+	fun read_bytes_to_cstring(bytes: CString, max: Int): Int do
+		var llen = lookahead_length
+		if llen == 0 then return raw_read_bytes(bytes, max)
+		var rd = max.min(llen)
+		var lk = lookahead
+		lk.copy_to(bytes, rd, 0, 0)
+		if rd < llen then
+			lk.lshift(rd, llen - rd, rd)
+			lookahead_length -= rd
+		else
+			lookahead_length = 0
 		end
-		return buf
+		return rd + raw_read_bytes(bytes.fast_cstring(rd), max - rd)
 	end
 
 	# Read a string until the end of the line.
@@ -175,29 +348,7 @@ abstract class Reader
 		var s = read_all_bytes
 		var slen = s.length
 		if slen == 0 then return ""
-		var rets = ""
-		var pos = 0
-		var str = s.items.clean_utf8(slen)
-		slen = str.bytelen
-		var sits = str.items
-		var remsp = slen
-		while pos < slen do
-			# The 129 size was decided more or less arbitrarily
-			# It will require some more benchmarking to compute
-			# if this is the best size or not
-			var chunksz = 129
-			if chunksz > remsp then
-				rets += new FlatString.with_infos(sits, remsp, pos, pos + remsp - 1)
-				break
-			end
-			var st = sits.find_beginning_of_char_at(pos + chunksz - 1)
-			var bytelen = st - pos
-			rets += new FlatString.with_infos(sits, bytelen, pos, st - 1)
-			pos = st
-			remsp -= bytelen
-		end
-		if rets isa Concat then return rets.balance
-		return rets
+		return codec.decode_string(s.items, s.length)
 	end
 
 	# Read all the stream until the eof.
@@ -207,9 +358,10 @@ abstract class Reader
 	do
 		if last_error != null then return new Bytes.empty
 		var s = new Bytes.empty
+		var buf = new CString(4096)
 		while not eof do
-			var c = read_byte
-			if c != null then s.add(c)
+			var rd = read_bytes_to_cstring(buf, 4096)
+			s.append_ns(buf, rd)
 		end
 		return s
 	end
@@ -263,7 +415,11 @@ abstract class Reader
 
 	# Is there something to read.
 	# This function returns 'false' if there is something to read.
-	fun eof: Bool is abstract
+	fun eof: Bool do
+		if lookahead_length > 0 then return false
+		lookahead_length = raw_read_bytes(lookahead, 1)
+		return lookahead_length <= 0
+	end
 
 	# Read the next sequence of non whitespace characters.
 	#
@@ -328,37 +484,18 @@ end
 # Iterator returned by `Reader::each_line`.
 # See the aforementioned method for details.
 class LineIterator
-	super Iterator[String]
+	super CachedIterator[String]
 
 	# The original stream
 	var stream: Reader
 
-	redef fun is_ok
+	redef fun next_item
 	do
-		var res = not stream.eof
-		if not res and close_on_finish then stream.close
-		return res
-	end
-
-	redef fun item
-	do
-		var line = self.line
-		if line == null then
-			line = stream.read_line
+		if stream.eof then
+			if close_on_finish then stream.close
+			return null
 		end
-		self.line = line
-		return line
-	end
-
-	# The last line read (cache)
-	private var line: nullable String = null
-
-	redef fun next
-	do
-		# force the read
-		if line == null then item
-		# drop the line
-		line = null
+		return stream.read_line
 	end
 
 	# Close the stream when the stream is at the EOF.
@@ -385,17 +522,23 @@ end
 abstract class Writer
 	super Stream
 
-	# The coder from a nit UTF-8 String to the output file
-	var coder: Coder = utf8_coder is writable
+	# Write bytes from `s`
+	fun write_bytes(s: Bytes) do write_bytes_from_cstring(s.items, s.length)
 
-	# Writes bytes from `s`
-	fun write_bytes(s: Bytes) is abstract
+	# Write `len` bytes from `ns`
+	fun write_bytes_from_cstring(ns: CString, len: Int) is abstract
 
-	# write a string
+	# Write a string
 	fun write(s: Text) is abstract
 
 	# Write a single byte
-	fun write_byte(value: Byte) is abstract
+	fun write_byte(value: Int) is abstract
+
+	# Write a single char
+	fun write_char(c: Char) do
+		var ln = codec.add_char_to(c, write_buffer)
+		write_bytes_from_cstring(write_buffer, ln)
+	end
 
 	# Can the stream be used to write
 	fun is_writable: Bool is abstract
@@ -412,17 +555,39 @@ interface Writable
 	# The specific logic it let to the concrete subclasses
 	fun write_to(stream: Writer) is abstract
 
-	# Like `write_to` but return a new String (may be quite large)
+	# Like `write_to` but return a new String (may be quite large).
 	#
-	# This funtionality is anectodical, since the point
-	# of streamable object to to be efficienlty written to a
-	# stream without having to allocate and concatenate strings
+	# This functionality is anecdotal, since the point
+	# of a streamable object is to be efficiently written to a
+	# stream without having to allocate and concatenate strings.
 	fun write_to_string: String
 	do
 		var stream = new StringWriter
 		write_to(stream)
 		return stream.to_s
 	end
+
+	# Like `write_to` but return a new Bytes (may be quite large)
+	#
+	# This functionality is anecdotal, since the point
+	# of a streamable object is to be efficiently written to a
+	# stream without having to allocate and concatenate buffers.
+	#
+	# Nevertheless, you might need this method if you want to know
+	# the byte size of a writable object.
+	fun write_to_bytes: Bytes
+	do
+		var stream = new BytesWriter
+		write_to(stream)
+		return stream.bytes
+	end
+end
+
+redef class Bytes
+	super Writable
+	redef fun write_to(s) do s.write_bytes(self)
+
+	redef fun write_to_string do return to_s
 end
 
 redef class Text
@@ -430,277 +595,176 @@ redef class Text
 	redef fun write_to(stream) do stream.write(self)
 end
 
-# Input streams with a buffered input for efficiency purposes
-abstract class BufferedReader
+# A `Stream` that can be written to and read from
+abstract class Duplex
 	super Reader
-	redef fun read_char
-	do
-		if last_error != null then return null
-		if eof then
-			last_error = new IOError("Stream has reached eof")
-			return null
-		end
-		# TODO: Fix when supporting UTF-8
-		var c = _buffer[_buffer_pos].to_i.ascii
-		_buffer_pos += 1
-		return c
-	end
+	super Writer
+end
 
-	redef fun read_byte
-	do
-		if last_error != null then return null
-		if eof then
-			last_error = new IOError("Stream has reached eof")
-			return null
-		end
-		var c = _buffer[_buffer_pos]
-		_buffer_pos += 1
-		return c
-	end
+# Write to `bytes` in memory
+#
+# ~~~
+# var writer = new BytesWriter
+#
+# writer.write "Strings "
+# writer.write_char '&'
+# writer.write_byte 0x20
+# writer.write_bytes "bytes".to_bytes
+#
+# assert writer.to_s == "\\x53\\x74\\x72\\x69\\x6E\\x67\\x73\\x20\\x26\\x20\\x62\\x79\\x74\\x65\\x73"
+# assert writer.bytes.to_s == "Strings & bytes"
+# ~~~
+#
+# As with any binary data, UTF-8 code points encoded on two bytes or more
+# can be constructed byte by byte.
+#
+# ~~~
+# writer = new BytesWriter
+#
+# # Write just the character first half
+# writer.write_byte 0xC2
+# assert writer.to_s == "\\xC2"
+# assert writer.bytes.to_s == "ï¿½"
+#
+# # Complete the character
+# writer.write_byte 0xA2
+# assert writer.to_s == "\\xC2\\xA2"
+# assert writer.bytes.to_s == "Â¢"
+# ~~~
+class BytesWriter
+	super Writer
 
-	# Resets the internal buffer
-	fun buffer_reset do
-		_buffer_length = 0
-		_buffer_pos = 0
-	end
+	# Written memory
+	var bytes = new Bytes.empty
 
-	# Peeks up to `n` bytes in the buffer
-	#
-	# The operation does not consume the buffer
-	#
-	# ~~~nitish
-	# var x = new FileReader.open("File.txt")
-	# assert x.peek(5) == x.read(5)
-	# ~~~
-	fun peek(i: Int): Bytes do
-		if eof then return new Bytes.empty
-		var remsp = _buffer_length - _buffer_pos
-		if i <= remsp then
-			var bf = new Bytes.with_capacity(i)
-			bf.append_ns_from(_buffer, i, _buffer_pos)
-			return bf
-		end
-		var bf = new Bytes.with_capacity(i)
-		bf.append_ns_from(_buffer, remsp, _buffer_pos)
-		_buffer_pos = _buffer_length
-		read_intern(i - bf.length, bf)
-		remsp = _buffer_length - _buffer_pos
-		var full_len = bf.length + remsp
-		if full_len > _buffer_capacity then
-			var c = _buffer_capacity
-			while c < full_len do c = c * 2 + 2
-			_buffer_capacity = c
-		end
-		var nns = new NativeString(_buffer_capacity)
-		bf.items.copy_to(nns, bf.length, 0, 0)
-		_buffer.copy_to(nns, remsp, _buffer_pos, bf.length)
-		_buffer = nns
-		_buffer_pos = 0
-		_buffer_length = full_len
-		return bf
-	end
+	redef fun to_s do return bytes.chexdigest
 
-	redef fun read_bytes(i)
+	redef fun write(str)
 	do
-		if last_error != null then return new Bytes.empty
-		var buf = new Bytes.with_capacity(i)
-		read_intern(i, buf)
-		return buf
+		if closed then return
+		str.append_to_bytes bytes
 	end
 
-	# Fills `buf` with at most `i` bytes read from `self`
-	private fun read_intern(i: Int, buf: Bytes): Int do
-		if eof then return 0
-		var p = _buffer_pos
-		var bufsp = _buffer_length - p
-		if bufsp >= i then
-			_buffer_pos += i
-			buf.append_ns_from(_buffer, i, p)
-			return i
-		end
-		_buffer_pos = _buffer_length
-		var readln = _buffer_length - p
-		buf.append_ns_from(_buffer, readln, p)
-		var rd = read_intern(i - readln, buf)
-		return rd + readln
-	end
-
-	redef fun read_all_bytes
+	redef fun write_char(c)
 	do
-		if last_error != null then return new Bytes.empty
-		var s = new Bytes.with_capacity(10)
-		var b = _buffer
-		while not eof do
-			var j = _buffer_pos
-			var k = _buffer_length
-			var rd_sz = k - j
-			s.append_ns_from(b, rd_sz, j)
-			_buffer_pos = k
-			fill_buffer
-		end
-		return s
+		if closed then return
+		bytes.add_char c
 	end
 
-	redef fun append_line_to(s)
+	redef fun write_byte(value)
 	do
-		var lb = new Bytes.with_capacity(10)
-		loop
-			# First phase: look for a '\n'
-			var i = _buffer_pos
-			while i < _buffer_length and _buffer[i] != 0xAu8 do
-				i += 1
-			end
-
-			var eol
-			if i < _buffer_length then
-				assert _buffer[i] == 0xAu8
-				i += 1
-				eol = true
-			else
-				eol = false
-			end
-
-			# if there is something to append
-			if i > _buffer_pos then
-				# Copy from the buffer to the string
-				var j = _buffer_pos
-				while j < i do
-					lb.add(_buffer[j])
-					j += 1
-				end
-				_buffer_pos = i
-			else
-				assert end_reached
-				s.append lb.to_s
-				return
-			end
-
-			if eol then
-				# so \n is found
-				s.append lb.to_s
-				return
-			else
-				# so \n is not found
-				if end_reached then
-					s.append lb.to_s
-					return
-				end
-				fill_buffer
-			end
-		end
+		if closed then return
+		bytes.add value
 	end
 
-	redef fun eof
-	do
-		if _buffer_pos < _buffer_length then return false
-		if end_reached then return true
-		fill_buffer
-		return _buffer_pos >= _buffer_length and end_reached
+	redef fun write_bytes_from_cstring(ns, len) do
+		if closed then return
+		bytes.append_ns(ns, len)
 	end
 
-	# The buffer
-	private var buffer: NativeString = new NativeString(0)
-
-	# The current position in the buffer
-	private var buffer_pos = 0
-
-	# Length of the current buffer (i.e. nuber of bytes in the buffer)
-	private var buffer_length = 0
-
-	# Capacity of the buffer
-	private var buffer_capacity = 0
+	# Is the stream closed?
+	protected var closed = false
 
-	# Fill the buffer
-	protected fun fill_buffer is abstract
+	redef fun close do closed = true
+	redef fun is_writable do return not closed
+end
 
-	# Has the last fill_buffer reached the end
-	protected fun end_reached: Bool is abstract
+# `Stream` writing to a `String`
+#
+# This class has the same behavior as `BytesWriter`
+# except for `to_s` which decodes `bytes` to a string.
+#
+# ~~~
+# var writer = new StringWriter
+#
+# writer.write "Strings "
+# writer.write_char '&'
+# writer.write_byte 0x20
+# writer.write_bytes "bytes".to_bytes
+#
+# assert writer.to_s == "Strings & bytes"
+# ~~~
+class StringWriter
+	super BytesWriter
 
-	# Allocate a `_buffer` for a given `capacity`.
-	protected fun prepare_buffer(capacity: Int)
-	do
-		_buffer = new NativeString(capacity)
-		_buffer_pos = 0 # need to read
-		_buffer_length = 0
-		_buffer_capacity = capacity
-	end
+	redef fun to_s do return bytes.to_s
 end
 
-# A `Stream` that can be written to and read from
-abstract class Duplex
+# Read from `bytes` in memory
+#
+# ~~~
+# var reader = new BytesReader(b"aâ¦b")
+# assert reader.read_char == 'a'
+# assert reader.read_byte == 0xE2 # 1st byte of 'â¦'
+# assert reader.read_byte == 0x80 # 2nd byte of 'â¦'
+# assert reader.read_char == 'ï¿½' # Reads the last byte as an invalid char
+# assert reader.read_all_bytes == b"b"
+# ~~~
+class BytesReader
 	super Reader
-	super Writer
-end
 
-# `Stream` that can be used to write to a `String`
-#
-# Mainly used for compatibility with Writer type and tests.
-class StringWriter
-	super Writer
+	# Source data to read
+	var bytes: Bytes
 
-	private var content = new Array[String]
-	redef fun to_s do return content.plain_to_s
-	redef fun is_writable do return not closed
+	# The current position in `bytes`
+	private var cursor = 0
 
-	redef fun write_bytes(b) do
-		content.add(b.to_s)
+	redef fun raw_read_byte
+	do
+		if cursor >= bytes.length then return -1
+
+		var c = bytes[cursor]
+		cursor += 1
+		return c.to_i
 	end
 
-	redef fun write(str)
+	redef fun close do bytes = new Bytes.empty
+
+	redef fun read_all_bytes
 	do
-		assert not closed
-		content.add(str.to_s)
+		var res = bytes.slice_from(cursor)
+		cursor = bytes.length
+		return res
 	end
 
-	# Is the stream closed?
-	protected var closed = false
+	redef fun raw_read_bytes(ns, max) do
+		if cursor >= bytes.length then return 0
 
-	redef fun close do closed = true
+		var copy = max.min(bytes.length - cursor)
+		bytes.items.copy_to(ns, copy, cursor, 0)
+		cursor += copy
+		return copy
+	end
+
+	redef fun eof do return cursor >= bytes.length
 end
 
-# `Stream` used to read from a `String`
+# `Stream` reading from a `String` source
 #
-# Mainly used for compatibility with Reader type and tests.
+# This class has the same behavior as `BytesReader`
+# except for its constructor accepting a `String`.
+#
+# ~~~
+# var reader = new StringReader("aâ¦b")
+# assert reader.read_char == 'a'
+# assert reader.read_byte == 0xE2 # 1st byte of 'â¦'
+# assert reader.read_byte == 0x80 # 2nd byte of 'â¦'
+# assert reader.read_char == 'ï¿½' # Reads the last byte as an invalid char
+# assert reader.read_all == "b"
+# ~~~
 class StringReader
-	super Reader
+	super BytesReader
 
-	# The string to read from.
-	var source: String
+	autoinit source
 
-	# The current position in the string (bytewise).
-	private var cursor: Int = 0
-
-	redef fun read_char do
-		if cursor < source.length then
-			# Fix when supporting UTF-8
-			var c = source[cursor]
-			cursor += 1
-			return c
-		else
-			return null
-		end
-	end
+	# Source data to read
+	var source: String
 
-	redef fun read_byte do
-		if cursor < source.length then
-			var c = source.bytes[cursor]
-			cursor += 1
-			return c
-		else
-			return null
-		end
-	end
+	init do bytes = source.to_bytes
 
-	redef fun close do
+	redef fun close
+	do
 		source = ""
+		super
 	end
-
-	redef fun read_all_bytes do
-		var nslen = source.length - cursor
-		var nns = new NativeString(nslen)
-		source.copy_to_native(nns, nslen, cursor, 0)
-		return new Bytes(nns, nslen, nslen)
-	end
-
-	redef fun eof do return cursor >= source.bytelen
 end