lib/core: provide default codec-aware read_char
author Lucas Bajolet <lucas.bajolet@gmail.com>
Mon, 7 May 2018 14:45:21 +0000 (10:45 -0400)
committer Lucas Bajolet <lucas.bajolet@gmail.com>
Thu, 10 May 2018 20:48:30 +0000 (16:48 -0400)
Previous implementations of read_char were unaware of codec issues, and
used to read a byte and convert it to a code point.

For ASCII characters this was enough, but as soon as multibyte Unicode
characters were read on a char-by-char basis, wrong characters would appear.

This commit fixes this issue by using the Codec API to read a character
intelligently, and properly support multibyte encodings.

Signed-off-by: Lucas Bajolet <lucas.bajolet@gmail.com>

lib/core/stream.nit
tests/sav/test_read_char.res [new file with mode: 0644]
tests/sav/test_read_char_alt1.res [new file with mode: 0644]
tests/test_read_char.nit [new file with mode: 0644]

index d884b62..5447b94 100644 (file)
@@ -129,7 +129,74 @@ abstract class Reader
        end
 
        # Reads a character. Returns `null` on EOF or timeout
-       fun read_char: nullable Char is abstract
+       #
+       # Returns unicode replacement character '�' (U+FFFD) if an
+       # invalid byte sequence is read.
+       #
+       # `read_char` may block if:
+       #
+       # * No byte could be read from the current buffer
+       # * An incomplete char is partially read, and more bytes are
+       #   required for full decoding.
+       fun read_char: nullable Char do
+               # Decodes one character out of `lookahead`, pulling raw bytes from
+               # the stream when the buffered ones are not enough.
+               # `lookahead_length` is the count of pending bytes kept in
+               # `lookahead` from one call to the next.
+               if eof then return null
+               var cod = codec
+               var codet_sz = cod.codet_size
+               var lk = lookahead
+               var llen = lookahead_length
+               # Make sure at least one full code unit is buffered
+               if llen < codet_sz then
+                       llen += raw_read_bytes(lk.fast_cstring(llen), codet_sz - llen)
+               end
+               if llen < codet_sz then
+                       # Truncated code unit: discard it and report a bad character
+                       lookahead_length = 0
+                       return 0xFFFD.code_point
+               end
+               # `ret` is compared with 0, 1 and 2 below.
+               # NOTE(review): presumably 0 = valid, 1 = incomplete, 2 = invalid;
+               # confirm against the documentation of `is_valid_char`.
+               var ret = cod.is_valid_char(lk, codet_sz)
+               var max_llen = cod.max_lookahead
+               # While the codec asks for more, append one code unit at a time,
+               # up to `max_lookahead` buffered bytes
+               while ret == 1 and llen < max_llen do
+                       var rd = raw_read_bytes(lk.fast_cstring(llen), codet_sz)
+                       if rd < codet_sz then
+                               # Input ended in the middle of a character: drop the
+                               # leading code unit and keep the rest for the next call
+                               llen -= codet_sz
+                               if llen > 0 then
+                                       lookahead.lshift(codet_sz, llen, codet_sz)
+                               end
+                               lookahead_length = llen.max(0)
+                               return 0xFFFD.code_point
+                       end
+                       llen += codet_sz
+                       ret = cod.is_valid_char(lk, llen)
+               end
+               if ret == 0 then
+                       var c = cod.decode_char(lk)
+                       # Never consume more bytes than are buffered. `u8char_len` is
+                       # the UTF-8 length of `c`, which can exceed the length of the
+                       # sequence that was decoded when `codec` is not UTF-8 (a
+                       # one-byte ISO-8859-1 'ä' takes two bytes in UTF-8).
+                       # Unclamped, `llen` became negative, so the next call handed a
+                       # negative offset to `fast_cstring` and skipped a byte of input.
+                       # NOTE(review): the clamp is exact for UTF-8 and whenever the
+                       # buffer holds only the decoded character; an exact count for
+                       # other codecs needs the codec to report the encoded length.
+                       var clen = c.u8char_len.min(llen)
+                       llen -= clen
+                       if llen > 0 then
+                               lookahead.lshift(clen, llen, clen)
+                       end
+                       lookahead_length = llen
+                       return c
+               end
+               if ret == 2 or ret == 1 then
+                       # Rejected sequence, or still incomplete after the loop gave
+                       # up: skip one code unit and emit a replacement character
+                       llen -= codet_sz
+                       if llen > 0 then
+                               lookahead.lshift(codet_sz, llen, codet_sz)
+                       end
+                       lookahead_length = llen
+                       return 0xFFFD.code_point
+               end
+               # Should not happen if the decoder works properly
+               var arr = new Array[Object]
+               arr.push "Decoder error: could not decode nor recover from byte sequence ["
+               for i in [0 .. llen[ do
+                       arr.push lk[i]
+                       arr.push ", "
+               end
+               arr.push "]"
+               # Chain the new error onto any previous one before recording it
+               var err = new IOError(arr.plain_to_s)
+               err.cause = last_error
+               last_error = err
+               return 0xFFFD.code_point
+       end
 
        # Reads a byte. Returns a negative value on error
        fun read_byte: Int do
@@ -528,20 +595,8 @@ end
 # Input streams with a buffered input for efficiency purposes
 abstract class BufferedReader
        super Reader
-       redef fun read_char
-       do
-               if last_error != null then return null
-               if eof then
-                       last_error = new IOError("Stream has reached eof")
-                       return null
-               end
-               # TODO: Fix when supporting UTF-8
-               var c = _buffer[_buffer_pos].to_i.code_point
-               _buffer_pos += 1
-               return c
-       end
 
-       redef fun read_byte
+       redef fun raw_read_byte
        do
                if last_error != null then return -1
                if eof then
@@ -834,17 +889,7 @@ class BytesReader
        # The current position in `bytes`
        private var cursor = 0
 
-       redef fun read_char
-       do
-               if cursor >= bytes.length then return null
-
-               var len = bytes.items.length_of_char_at(cursor)
-               var char = bytes.items.char_at(cursor)
-               cursor += len
-               return char
-       end
-
-       redef fun read_byte
+       redef fun raw_read_byte
        do
                if cursor >= bytes.length then return -1
 
diff --git a/tests/sav/test_read_char.res b/tests/sav/test_read_char.res
new file mode 100644 (file)
index 0000000..602107c
--- /dev/null
@@ -0,0 +1,16 @@
+五
+a
+十
+音
+d
+図
+:
+ 
+f
+サ
+行
+j
+ 
+ア
+k
+段
diff --git a/tests/sav/test_read_char_alt1.res b/tests/sav/test_read_char_alt1.res
new file mode 100644 (file)
index 0000000..5e84276
--- /dev/null
@@ -0,0 +1,19 @@
+\94
+\81
+\9f
+d
+f
+j
diff --git a/tests/test_read_char.nit b/tests/test_read_char.nit
new file mode 100644 (file)
index 0000000..1ec61f0
--- /dev/null
@@ -0,0 +1,23 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Mixed single-byte and multibyte sample, read back one character at a time
+var sample = "五a十音d図: fサ行j アk段"
+var in_stream = new BytesReader(sample.to_bytes)
+#alt1 in_stream.codec = iso88591_codec
+
+# Print every character on its own line until `read_char` yields `null`
+loop
+       var ch = in_stream.read_char
+       if ch == null then break
+       print ch
+end