Merge: Intro Codec
author: Jean Privat <jean@pryen.org>
Thu, 20 Aug 2015 14:45:44 +0000 (10:45 -0400)
committer: Jean Privat <jean@pryen.org>
Thu, 20 Aug 2015 14:45:44 +0000 (10:45 -0400)
Since UTF-8 is now part of Nit, the standard requires conforming implementations to properly handle borderline cases such as overlong sequences and the like.

The codec defined here sanitizes input before letting Nit process it, avoiding potential security [issues](https://www.owasp.org/index.php/Canonicalization,_locale_and_Unicode).

The codec architecture can also be used later to handle different encodings for source files (either that, or we decide that anything that is not UTF-8 is to be rejected/misinterpreted) or for text.

Pull-Request: #1628
Reviewed-by: Jean Privat <jean@pryen.org>
Reviewed-by: Alexandre Terrasa <alexandre@moz-code.org>

1  2 
lib/standard/stream.nit

diff --combined lib/standard/stream.nit
@@@ -14,6 -14,7 +14,7 @@@ module strea
  intrude import text::ropes
  import error
  intrude import bytes
+ import codecs
  
  in "C" `{
        #include <unistd.h>
@@@ -43,6 -44,10 +44,10 @@@ en
  # A `Stream` that can be read from
  abstract class Reader
        super Stream
+       # Decoder used to transform input bytes to UTF-8
+       var decoder: Decoder = utf8_decoder is writable
        # Reads a character. Returns `null` on EOF or timeout
        fun read_char: nullable Char is abstract
  
        # ~~~
        fun read_all: String do
                var s = read_all_bytes
+               if not s.is_utf8 then s = s.clean_utf8
                var slen = s.length
                if slen == 0 then return ""
                var rets = ""
@@@ -378,6 -384,9 +384,9 @@@ en
  abstract class Writer
        super Stream
  
+       # The coder from a nit UTF-8 String to the output file
+       var coder: Coder = utf8_coder is writable
        # Writes bytes from `s`
        fun write_bytes(s: Bytes) is abstract
  
@@@ -448,6 -457,7 +457,7 @@@ abstract class BufferedReade
                return c
        end
  
+       # Resets the internal buffer
        fun buffer_reset do
                _buffer_length = 0
                _buffer_pos = 0
        do
                if last_error != null then return new Bytes.empty
                var s = new Bytes.with_capacity(10)
 +              var b = _buffer
                while not eof do
                        var j = _buffer_pos
                        var k = _buffer_length
 -                      while j < k do
 -                              s.add(_buffer[j])
 -                              j += 1
 -                      end
 -                      _buffer_pos = j
 +                      var rd_sz = k - j
 +                      s.append_ns_from(b, rd_sz, j)
 +                      _buffer_pos = k
                        fill_buffer
                end
                return s
  
        redef fun append_line_to(s)
        do
+               var lb = new Bytes.with_capacity(10)
                loop
                        # First phase: look for a '\n'
                        var i = _buffer_pos
  
                        # if there is something to append
                        if i > _buffer_pos then
                                # Copy from the buffer to the string
                                var j = _buffer_pos
                                while j < i do
-                                       s.bytes.add(_buffer[j])
+                                       lb.add(_buffer[j])
                                        j += 1
                                end
                                _buffer_pos = i
                        else
                                assert end_reached
+                               s.append lb.to_s
                                return
                        end
  
                        if eol then
                                # so \n is found
+                               s.append lb.to_s
                                return
                        else
                                # so \n is not found
-                               if end_reached then return
+                               if end_reached then
+                                       s.append lb.to_s
+                                       return
+                               end
                                fill_buffer
                        end
                end