Merge: Intro Codec
author Jean Privat <jean@pryen.org>
Thu, 20 Aug 2015 14:45:44 +0000 (10:45 -0400)
committer Jean Privat <jean@pryen.org>
Thu, 20 Aug 2015 14:45:44 +0000 (10:45 -0400)
As UTF-8 is now part of Nit, the standard requires conforming implementations to properly handle borderline cases such as overlong sequences.

The codec defined here sanitizes an input before letting Nit play with it, avoiding potential security [issues](https://www.owasp.org/index.php/Canonicalization,_locale_and_Unicode).

The codec architecture can also be used later to handle different encodings for source files or text (either that, or we decide that anything that is not UTF-8 is to be rejected/misinterpreted).

Pull-Request: #1628
Reviewed-by: Jean Privat <jean@pryen.org>
Reviewed-by: Alexandre Terrasa <alexandre@moz-code.org>

lib/standard/bytes.nit
lib/standard/codecs/codec_base.nit [new file with mode: 0644]
lib/standard/codecs/codecs.nit [new file with mode: 0644]
lib/standard/codecs/utf8.nit [new file with mode: 0644]
lib/standard/stream.nit
tests/UTF-8-test.txt [new file with mode: 0644]
tests/sav/nitpick_args1.res
tests/sav/test_read_all.res [new file with mode: 0644]
tests/sav/test_read_all_args1.res [new file with mode: 0644]
tests/test_read_all.args [new file with mode: 0644]
tests/test_read_all.nit [new file with mode: 0644]

index b8cc1fb..59c4c5f 100644 (file)
@@ -145,10 +145,81 @@ class Bytes
 
        redef fun to_s do
                persisted = true
-               return new FlatString.with_infos(items, length, 0, length -1)
+               var b = self
+               if not is_utf8 then
+                       b = clean_utf8
+                       persisted = false
+               end
+               return new FlatString.with_infos(b.items, b.length, 0, b.length -1)
        end
 
        redef fun iterator do return new BytesIterator.with_buffer(self)
+
+       # Is the byte collection valid UTF-8 ?
+       #
+       # Returns `false` as soon as one of the following is found:
+       #
+       # * a leading byte that does not match the pattern of its announced length
+       # * a sequence that runs past `length` (truncated input)
+       # * a trailing byte that is not a continuation byte (`10xxxxxx`)
+       # * a code point outside the bounds allowed for its sequence length
+       # * a code point in U+D800..U+DFFF, or equal to U+FFFE or U+FFFF
+       fun is_utf8: Bool do
+               # (mask, expected value) pairs for the leading byte, indexed by `(length - 1) * 2`
+               var charst = once [0x80u8, 0u8, 0xE0u8, 0xC0u8, 0xF0u8, 0xE0u8, 0xF8u8, 0xF0u8]
+               # Lowest and highest code point accepted for each sequence length
+               var lobounds = once [0, 0x80, 0x800, 0x10000]
+               var hibounds = once [0x7F, 0x7FF, 0xFFFF, 0x10FFFF]
+               var pos = 0
+               var len = length
+               var mits = items
+               while pos < len do
+                       var nxst = mits.length_of_char_at(pos)
+                       var pend = pos + nxst
+                       # Never decode a sequence cut short by the end of the collection
+                       if pend > len then return false
+                       var charst_index = (nxst - 1) * 2
+                       if mits[pos] & charst[charst_index] != charst[charst_index + 1] then return false
+                       # All the bytes after the leading one must be continuation bytes
+                       for i in [pos + 1 .. pend[ do
+                               if mits[i] & 0xC0u8 != 0x80u8 then return false
+                       end
+                       var cp = mits.char_at(pos).ascii
+                       if cp > hibounds[nxst - 1] or cp < lobounds[nxst - 1] then return false
+                       if cp >= 0xD800 and cp <= 0xDFFF or
+                          cp == 0xFFFE or cp == 0xFFFF then return false
+                       pos = pend
+               end
+               return true
+       end
+
+       # Cleans the bytes of `self` to be UTF-8 compliant
+       #
+       # Returns a new `Bytes` and leaves `self` untouched.
+       # Accepted sequences are copied as-is.
+       # Any byte that does not start an accepted sequence is replaced by
+       # the bytes `EF BF BD` (U+FFFD), and scanning resumes at the next byte.
+       #
+       # A sequence is accepted when it fits within `length`, its leading byte
+       # matches the pattern of its announced length, its trailing bytes are
+       # continuation bytes (`10xxxxxx`), and its code point is within the bounds
+       # of its length and is neither in U+D800..U+DFFF nor U+FFFE nor U+FFFF.
+       private fun clean_utf8: Bytes do
+               # (mask, expected value) pairs for the leading byte, indexed by `(length - 1) * 2`
+               var charst = once [0x80u8, 0u8, 0xE0u8, 0xC0u8, 0xF0u8, 0xE0u8, 0xF8u8, 0xF0u8]
+               # UTF-8 encoding of the replacement character U+FFFD
+               var badchar = once [0xEFu8, 0xBFu8, 0xBDu8]
+               # Lowest and highest code point accepted for each sequence length
+               var lobounds = once [0, 0x80, 0x800, 0x10000]
+               var hibounds = once [0x7F, 0x7FF, 0xFFFF, 0x10FFFF]
+               var pos = 0
+               var len = length
+               var ret = new Bytes.with_capacity(len)
+               var mits = items
+               while pos < len do
+                       var nxst = mits.length_of_char_at(pos)
+                       var pend = pos + nxst
+                       var charst_index = (nxst - 1) * 2
+                       # A sequence cut short by the end of `self` is never decoded nor copied
+                       var valid = pend <= len
+                       if valid then
+                               valid = mits[pos] & charst[charst_index] == charst[charst_index + 1]
+                       end
+                       if valid then
+                               # All the bytes after the leading one must be continuation bytes
+                               for i in [pos + 1 .. pend[ do
+                                       if mits[i] & 0xC0u8 != 0x80u8 then
+                                               valid = false
+                                               break
+                                       end
+                               end
+                       end
+                       if valid then
+                               var cp = mits.char_at(pos).ascii
+                               if cp > hibounds[nxst - 1] or cp < lobounds[nxst - 1] then
+                                       valid = false
+                               else if cp >= 0xD800 and cp <= 0xDFFF or
+                                  cp == 0xFFFE or cp == 0xFFFF then
+                                       valid = false
+                               end
+                       end
+                       if valid then
+                               for i in [pos .. pend[ do ret.add mits[i]
+                               pos = pend
+                       else
+                               # Replace the offending byte only, then retry from the next one
+                               ret.append badchar
+                               pos += 1
+                       end
+               end
+               return ret
+       end
 end
 
 private class BytesIterator
@@ -178,21 +249,23 @@ redef class Text
        # ~~~
        fun to_bytes: Bytes do
                var b = new Bytes.with_capacity(bytelen)
+               append_to_bytes b
+               return b
+       end
+
+       # Appends `self.bytes` to `b`
+       fun append_to_bytes(b: Bytes) do
                for s in substrings do
                        var from = if s isa FlatString then s.first_byte else 0
                        b.append_ns_from(s.items, s.bytelen, from)
                end
-               return b
        end
 end
 
 redef class FlatText
-       redef fun to_bytes do
-               var len = bytelen
-               var b = new Bytes.with_capacity(len)
+       redef fun append_to_bytes(b) do
                var from = if self isa FlatString then first_byte else 0
-               b.append_ns_from(items, len, from)
-               return b
+               b.append_ns_from(items, bytelen, from)
        end
 end
 
diff --git a/lib/standard/codecs/codec_base.nit b/lib/standard/codecs/codec_base.nit
new file mode 100644 (file)
index 0000000..b4a9523
--- /dev/null
@@ -0,0 +1,51 @@
+# This file is part of NIT (http://www.nitlanguage.org).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Base for codecs to use with streams
+#
+# A Codec (Coder/Decoder) is a transformer from a byte-format to another
+#
+# As Nit Strings are UTF-8, a codec works as :
+# - Coder: From a UTF-8 string to a specified format (writing)
+# - Decoder: From a specified format to a UTF-8 string (reading)
+module codec_base
+
+import text
+import bytes
+
+# Codes UTF-8 entities to an external format
+#
+# Every service is abstract: a concrete subclass provides the actual format.
+# Two flavours are offered for chars and strings: one that returns a `Bytes`,
+# and one that adds to a `Bytes` supplied by the caller.
+abstract class Coder
+
+       # Transforms `c` to its representation in the format of `self`
+       fun code_char(c: Char): Bytes is abstract
+
+       # Adds a char `c` to bytes `s`
+       #
+       # NOTE(review): presumably `c` is coded in the format of `self`, as with `code_char`.
+       fun add_char_to(c: Char, s: Bytes) is abstract
+
+       # Transforms `s` to the format of `self`
+       fun code_string(s: Text): Bytes is abstract
+
+       # Adds a string `s` to bytes `b`
+       #
+       # NOTE(review): presumably `s` is coded in the format of `self`, as with `code_string`.
+       fun add_string_to(s: Text, b: Bytes) is abstract
+end
+
+# Decodes entities in an external format to UTF-8
+#
+# Every service is abstract: a concrete subclass provides the actual format.
+abstract class Decoder
+
+       # Decodes a char from `b` to a Unicode code-point
+       #
+       # NOTE(review): behaviour on an empty `b` is not specified here.
+       fun decode_char(b: Bytes): Char is abstract
+
+       # Decodes a string `b` to UTF-8
+       fun decode_string(b: Bytes): String is abstract
+end
diff --git a/lib/standard/codecs/codecs.nit b/lib/standard/codecs/codecs.nit
new file mode 100644 (file)
index 0000000..25e9931
--- /dev/null
@@ -0,0 +1,19 @@
+# This file is part of NIT (http://www.nitlanguage.org).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Group module for all codec-related manipulations
+module codecs
+
+import codec_base
+import utf8
diff --git a/lib/standard/codecs/utf8.nit b/lib/standard/codecs/utf8.nit
new file mode 100644 (file)
index 0000000..65f2fc9
--- /dev/null
@@ -0,0 +1,50 @@
+# This file is part of NIT (http://www.nitlanguage.org).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Codec for UTF-8 I/O
+module utf8
+
+import codec_base
+
+# Coder for UTF-8 output: entities are returned as-is
+private class UTF8Coder
+       super Coder
+
+       redef fun code_char(c) do
+               var text = c.to_s
+               return text.to_bytes
+       end
+
+       redef fun add_char_to(c, s) do
+               var text = c.to_s
+               text.append_to_bytes(s)
+       end
+
+       redef fun code_string(s) do
+               return s.to_bytes
+       end
+
+       redef fun add_string_to(s, b) do
+               s.append_to_bytes(b)
+       end
+end
+
+# Decoder for UTF-8 input: bytes are turned into text through `Bytes::to_s`
+private class UTF8Decoder
+       super Decoder
+
+       # First character of the text obtained from `b`
+       redef fun decode_char(b) do return b.to_s[0]
+
+       # Whole content of `b` as a `String`
+       redef fun decode_string(b) do
+               var decoded = b.to_s
+               return decoded
+       end
+end
+
+# Returns the instance of a UTF-8 Coder
+#
+# The instance is created once and returned by every later call.
+fun utf8_coder: Coder
+do
+       var instance = once new UTF8Coder
+       return instance
+end
+
+# Returns the instance of a UTF-8 Decoder
+#
+# The instance is created once and returned by every later call.
+fun utf8_decoder: Decoder
+do
+       var instance = once new UTF8Decoder
+       return instance
+end
index 0eabcba..2db319a 100644 (file)
@@ -14,6 +14,7 @@ module stream
 intrude import text::ropes
 import error
 intrude import bytes
+import codecs
 
 in "C" `{
        #include <unistd.h>
@@ -43,6 +44,10 @@ end
 # A `Stream` that can be read from
 abstract class Reader
        super Stream
+
+       # Decoder used to transform input bytes to UTF-8
+       var decoder: Decoder = utf8_decoder is writable
+
        # Reads a character. Returns `null` on EOF or timeout
        fun read_char: nullable Char is abstract
 
@@ -168,6 +173,7 @@ abstract class Reader
        # ~~~
        fun read_all: String do
                var s = read_all_bytes
+               if not s.is_utf8 then s = s.clean_utf8
                var slen = s.length
                if slen == 0 then return ""
                var rets = ""
@@ -378,6 +384,9 @@ end
 abstract class Writer
        super Stream
 
+       # The coder from a nit UTF-8 String to the output file
+       var coder: Coder = utf8_coder is writable
+
        # Writes bytes from `s`
        fun write_bytes(s: Bytes) is abstract
 
@@ -448,6 +457,7 @@ abstract class BufferedReader
                return c
        end
 
+       # Resets the internal buffer
        fun buffer_reset do
                _buffer_length = 0
                _buffer_pos = 0
@@ -532,6 +542,7 @@ abstract class BufferedReader
 
        redef fun append_line_to(s)
        do
+               var lb = new Bytes.with_capacity(10)
                loop
                        # First phase: look for a '\n'
                        var i = _buffer_pos
@@ -550,27 +561,29 @@ abstract class BufferedReader
 
                        # if there is something to append
                        if i > _buffer_pos then
-                               # Enlarge the string (if needed)
-                               s.enlarge(s.bytelen + i - _buffer_pos)
-
                                # Copy from the buffer to the string
                                var j = _buffer_pos
                                while j < i do
-                                       s.bytes.add(_buffer[j])
+                                       lb.add(_buffer[j])
                                        j += 1
                                end
                                _buffer_pos = i
                        else
                                assert end_reached
+                               s.append lb.to_s
                                return
                        end
 
                        if eol then
                                # so \n is found
+                               s.append lb.to_s
                                return
                        else
                                # so \n is not found
-                               if end_reached then return
+                               if end_reached then
+                                       s.append lb.to_s
+                                       return
+                               end
                                fill_buffer
                        end
                end
diff --git a/tests/UTF-8-test.txt b/tests/UTF-8-test.txt
new file mode 100644 (file)
index 0000000..abd16f7
Binary files /dev/null and b/tests/UTF-8-test.txt differ
index f9e2cbb..84ff5ce 100644 (file)
@@ -1,4 +1,3 @@
-../lib/standard/stream.nit:451,6--17: Documentation warning: Undocumented property `buffer_reset`
 test_advice_repeated_types.nit:36,15--20: Warning: useless type repetition on redefined attribute `_a`
 test_advice_repeated_types.nit:37,18--20: Warning: useless type repetition on parameter `b1` for redefined method `b`
 test_advice_repeated_types.nit:38,18--20: Warning: useless type repetition on parameter `c1` for redefined method `c`
diff --git a/tests/sav/test_read_all.res b/tests/sav/test_read_all.res
new file mode 100644 (file)
index 0000000..2f4cc2a
--- /dev/null
@@ -0,0 +1 @@
+usage ./test_read_all file
diff --git a/tests/sav/test_read_all_args1.res b/tests/sav/test_read_all_args1.res
new file mode 100644 (file)
index 0000000..6228c6e
Binary files /dev/null and b/tests/sav/test_read_all_args1.res differ
diff --git a/tests/test_read_all.args b/tests/test_read_all.args
new file mode 100644 (file)
index 0000000..e497886
--- /dev/null
@@ -0,0 +1 @@
+UTF-8-test.txt
diff --git a/tests/test_read_all.nit b/tests/test_read_all.nit
new file mode 100644 (file)
index 0000000..4f2d7e0
--- /dev/null
@@ -0,0 +1,26 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Prints, char by char, the content of the file given as first argument,
+# as returned by `read_all`
+if args.is_empty then
+       print "usage ./test_read_all file"
+       exit -1
+end
+
+var file = new FileReader.open(args[0])
+
+var s = file.read_all
+# The whole content is in `s`: the file is no longer needed
+file.close
+
+for i in s do
+       printn i
+end