Merge: Intro Codec

author Jean Privat <jean@pryen.org>

Thu, 20 Aug 2015 14:45:44 +0000 (10:45 -0400)

committer Jean Privat <jean@pryen.org>

Thu, 20 Aug 2015 14:45:44 +0000 (10:45 -0400)
author Jean Privat <jean@pryen.org>
Thu, 20 Aug 2015 14:45:44 +0000 (10:45 -0400)
committer Jean Privat <jean@pryen.org>
Thu, 20 Aug 2015 14:45:44 +0000 (10:45 -0400)
diff --git a/lib/standard/bytes.nit b/lib/standard/bytes.nit

index b8cc1fb..59c4c5f 100644 (file)
--- a/lib/standard/bytes.nit
+++ b/lib/standard/bytes.nit
@@ -145,10 +145,81 @@ class Bytes
  
         redef fun to_s do
                 persisted = true
-               return new FlatString.with_infos(items, length, 0, length -1)
+               var b = self
+               if not is_utf8 then
+                       b = clean_utf8
+                       persisted = false
+               end
+               return new FlatString.with_infos(b.items, b.length, 0, b.length -1)
         end
  
         redef fun iterator do return new BytesIterator.with_buffer(self)
+
+       # Is the byte collection valid UTF-8 ?
+       fun is_utf8: Bool do
+               var charst = once [0x80u8, 0u8, 0xE0u8, 0xC0u8, 0xF0u8, 0xE0u8, 0xF8u8, 0xF0u8]
+               var lobounds = once [0, 0x80, 0x800, 0x10000]
+               var hibounds = once [0x7F, 0x7FF, 0xFFFF, 0x10FFFF]
+               var pos = 0
+               var len = length
+               var mits = items
+               while pos < len do
+                       var nxst = mits.length_of_char_at(pos)
+                       var charst_index = (nxst - 1) * 2
+                       if mits[pos] & charst[charst_index] == charst[charst_index + 1] then
+                               var c = mits.char_at(pos)
+                               var cp = c.ascii
+                               if cp <= hibounds[nxst - 1] and cp >= lobounds[nxst - 1] then
+                                       if cp >= 0xD800 and cp <= 0xDFFF or
+                                          cp == 0xFFFE or cp == 0xFFFF then return false
+                               else
+                                       return false
+                               end
+                       else
+                               return false
+                       end
+                       pos += nxst
+               end
+               return true
+       end
+
+       # Cleans the bytes of `self` to be UTF-8 compliant
+       private fun clean_utf8: Bytes do
+               var charst = once [0x80u8, 0u8, 0xE0u8, 0xC0u8, 0xF0u8, 0xE0u8, 0xF8u8, 0xF0u8]
+               var badchar = once [0xEFu8, 0xBFu8, 0xBDu8]
+               var lobounds = once [0, 0x80, 0x800, 0x10000]
+               var hibounds = once [0x7F, 0x7FF, 0xFFFF, 0x10FFFF]
+               var pos = 0
+               var len = length
+               var ret = new Bytes.with_capacity(len)
+               var mits = items
+               while pos < len do
+                       var nxst = mits.length_of_char_at(pos)
+                       var charst_index = (nxst - 1) * 2
+                       if mits[pos] & charst[charst_index] == charst[charst_index + 1] then
+                               var c = mits.char_at(pos)
+                               var cp = c.ascii
+                               if cp <= hibounds[nxst - 1] and cp >= lobounds[nxst - 1] then
+                                       if cp >= 0xD800 and cp <= 0xDFFF or
+                                          cp == 0xFFFE or cp == 0xFFFF then
+                                               ret.append badchar
+                                               pos += 1
+                                       else
+                                               var pend = pos + nxst
+                                               for i in [pos .. pend[ do ret.add mits[i]
+                                               pos += nxst
+                                       end
+                               else
+                                       ret.append badchar
+                                       pos += 1
+                               end
+                       else
+                               ret.append badchar
+                               pos += 1
+                       end
+               end
+               return ret
+       end
  end
  
  private class BytesIterator
@@ -178,21 +249,23 @@ redef class Text
         # ~~~
         fun to_bytes: Bytes do
                 var b = new Bytes.with_capacity(bytelen)
+               append_to_bytes b
+               return b
+       end
+
+       # Appends `self.bytes` to `b`
+       fun append_to_bytes(b: Bytes) do
                 for s in substrings do
                         var from = if s isa FlatString then s.first_byte else 0
                         b.append_ns_from(s.items, s.bytelen, from)
                 end
-               return b
         end
  end
  
  redef class FlatText
-       redef fun to_bytes do
-               var len = bytelen
-               var b = new Bytes.with_capacity(len)
+       redef fun append_to_bytes(b) do
                 var from = if self isa FlatString then first_byte else 0
-               b.append_ns_from(items, len, from)
-               return b
+               b.append_ns_from(items, bytelen, from)
         end
  end
  
diff --git a/lib/standard/codecs/codec_base.nit b/lib/standard/codecs/codec_base.nit

new file mode 100644 (file)

index 0000000..b4a9523
--- /dev/null
+++ b/lib/standard/codecs/codec_base.nit
@@ -0,0 +1,51 @@
+# This file is part of NIT (http://www.nitlanguage.org).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Base for codecs to use with streams
+#
+# A Codec (Coder/Decoder) is a transformer from one byte format to another
+#
+# As Nit Strings are UTF-8, a codec works as :
+# - Coder: From a UTF-8 string to a specified format (writing)
+# - Decoder: From a specified format to a UTF-8 string (reading)
+module codec_base
+
+import text
+import bytes
+
+# Codes UTF-8 entities to an external format
+abstract class Coder
+
+       # Transforms `c` to its representation in the format of `self`
+       fun code_char(c: Char): Bytes is abstract
+
+       # Adds a char `c` to bytes `s`
+       fun add_char_to(c: Char, s: Bytes) is abstract
+
+       # Transforms `s` to the format of `self`
+       fun code_string(s: Text): Bytes is abstract
+
+       # Adds a string `s` to bytes `b`
+       fun add_string_to(s: Text, b: Bytes) is abstract
+end
+
+# Decodes entities in an external format to UTF-8
+abstract class Decoder
+
+       # Decodes a char from `b` to a Unicode code-point
+       fun decode_char(b: Bytes): Char is abstract
+
+       # Decodes a string `b` to UTF-8
+       fun decode_string(b: Bytes): String is abstract
+end
diff --git a/lib/standard/codecs/codecs.nit b/lib/standard/codecs/codecs.nit

new file mode 100644 (file)

index 0000000..25e9931
--- /dev/null
+++ b/lib/standard/codecs/codecs.nit
@@ -0,0 +1,19 @@
+# This file is part of NIT (http://www.nitlanguage.org).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Group module for all codec-related manipulations
+module codecs
+
+import codec_base
+import utf8
diff --git a/lib/standard/codecs/utf8.nit b/lib/standard/codecs/utf8.nit

new file mode 100644 (file)

index 0000000..65f2fc9
--- /dev/null
+++ b/lib/standard/codecs/utf8.nit
@@ -0,0 +1,50 @@
+# This file is part of NIT (http://www.nitlanguage.org).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Codec for UTF-8 I/O
+module utf8
+
+import codec_base
+
+# Returns UTF-8 entities as-is
+private class UTF8Coder
+       super Coder
+
+       redef fun code_char(c) do return c.to_s.to_bytes
+
+       redef fun add_char_to(c, stream) do c.to_s.append_to_bytes(stream)
+
+       redef fun code_string(s) do return s.to_bytes
+
+       redef fun add_string_to(s, b) do s.append_to_bytes(b)
+end
+
+# Decodes UTF-8 bytes to a UTF-8 string by delegating to `Bytes::to_s`
+private class UTF8Decoder
+       super Decoder
+
+       redef fun decode_char(b) do
+               var s = b.to_s
+               return s[0]
+       end
+
+       redef fun decode_string(b) do
+               return b.to_s
+       end
+end
+
+# Returns the instance of a UTF-8 Coder
+fun utf8_coder: Coder do return once new UTF8Coder
+# Returns the instance of a UTF-8 Decoder
+fun utf8_decoder: Decoder do return once new UTF8Decoder
diff --git a/lib/standard/stream.nit b/lib/standard/stream.nit

index 0eabcba..2db319a 100644 (file)
--- a/lib/standard/stream.nit
+++ b/lib/standard/stream.nit
@@ -14,6 +14,7 @@ module stream
  intrude import text::ropes
  import error
  intrude import bytes
+import codecs
  
  in "C" `{
         #include <unistd.h>
@@ -43,6 +44,10 @@ end
  # A `Stream` that can be read from
  abstract class Reader
         super Stream
+
+       # Decoder used to transform input bytes to UTF-8
+       var decoder: Decoder = utf8_decoder is writable
+
         # Reads a character. Returns `null` on EOF or timeout
         fun read_char: nullable Char is abstract
  
@@ -168,6 +173,7 @@ abstract class Reader
         # ~~~
         fun read_all: String do
                 var s = read_all_bytes
+               if not s.is_utf8 then s = s.clean_utf8
                 var slen = s.length
                 if slen == 0 then return ""
                 var rets = ""
@@ -378,6 +384,9 @@ end
  abstract class Writer
         super Stream
  
+       # The coder from a nit UTF-8 String to the output file
+       var coder: Coder = utf8_coder is writable
+
         # Writes bytes from `s`
         fun write_bytes(s: Bytes) is abstract
  
@@ -448,6 +457,7 @@ abstract class BufferedReader
                 return c
         end
  
+       # Resets the internal buffer
         fun buffer_reset do
                 _buffer_length = 0
                 _buffer_pos = 0
@@ -532,6 +542,7 @@ abstract class BufferedReader
  
         redef fun append_line_to(s)
         do
+               var lb = new Bytes.with_capacity(10)
                 loop
                         # First phase: look for a '\n'
                         var i = _buffer_pos
@@ -550,27 +561,29 @@ abstract class BufferedReader
  
                         # if there is something to append
                         if i > _buffer_pos then
-                               # Enlarge the string (if needed)
-                               s.enlarge(s.bytelen + i - _buffer_pos)
-
                                 # Copy from the buffer to the string
                                 var j = _buffer_pos
                                 while j < i do
-                                       s.bytes.add(_buffer[j])
+                                       lb.add(_buffer[j])
                                         j += 1
                                 end
                                 _buffer_pos = i
                         else
                                 assert end_reached
+                               s.append lb.to_s
                                 return
                         end
  
                         if eol then
                                 # so \n is found
+                               s.append lb.to_s
                                 return
                         else
                                 # so \n is not found
-                               if end_reached then return
+                               if end_reached then
+                                       s.append lb.to_s
+                                       return
+                               end
                                 fill_buffer
                         end
                 end
diff --git a/tests/UTF-8-test.txt b/tests/UTF-8-test.txt

new file mode 100644 (file)

index 0000000..abd16f7

Binary files /dev/null and b/tests/UTF-8-test.txt differ
diff --git a/tests/sav/nitpick_args1.res b/tests/sav/nitpick_args1.res

index f9e2cbb..84ff5ce 100644 (file)
--- a/tests/sav/nitpick_args1.res
+++ b/tests/sav/nitpick_args1.res
@@ -1,4 +1,3 @@
-../lib/standard/stream.nit:451,6--17: Documentation warning: Undocumented property `buffer_reset`
  test_advice_repeated_types.nit:36,15--20: Warning: useless type repetition on redefined attribute `_a`
  test_advice_repeated_types.nit:37,18--20: Warning: useless type repetition on parameter `b1` for redefined method `b`
  test_advice_repeated_types.nit:38,18--20: Warning: useless type repetition on parameter `c1` for redefined method `c`
diff --git a/tests/sav/test_read_all.res b/tests/sav/test_read_all.res

new file mode 100644 (file)

index 0000000..2f4cc2a
--- /dev/null
+++ b/tests/sav/test_read_all.res
@@ -0,0 +1 @@
+usage ./test_read_all file
diff --git a/tests/sav/test_read_all_args1.res b/tests/sav/test_read_all_args1.res

new file mode 100644 (file)

index 0000000..6228c6e

Binary files /dev/null and b/tests/sav/test_read_all_args1.res differ
diff --git a/tests/test_read_all.args b/tests/test_read_all.args

new file mode 100644 (file)

index 0000000..e497886
--- /dev/null
+++ b/tests/test_read_all.args
@@ -0,0 +1 @@
+UTF-8-test.txt
diff --git a/tests/test_read_all.nit b/tests/test_read_all.nit

new file mode 100644 (file)

index 0000000..4f2d7e0
--- /dev/null
+++ b/tests/test_read_all.nit
@@ -0,0 +1,26 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if args.is_empty then
+       print "usage ./test_read_all file"
+       exit -1
+end
+
+var file = new FileReader.open(args[0])
+
+var s = file.read_all
+
+for i in s do
+       printn i
+end
author	Jean Privat <jean@pryen.org>
	Thu, 20 Aug 2015 14:45:44 +0000 (10:45 -0400)
committer	Jean Privat <jean@pryen.org>
	Thu, 20 Aug 2015 14:45:44 +0000 (10:45 -0400)
lib/standard/bytes.nit		patch \| blob \| history
lib/standard/codecs/codec_base.nit	[new file with mode: 0644]	patch \| blob
lib/standard/codecs/codecs.nit	[new file with mode: 0644]	patch \| blob
lib/standard/codecs/utf8.nit	[new file with mode: 0644]	patch \| blob
lib/standard/stream.nit		patch \| blob \| history
tests/UTF-8-test.txt	[new file with mode: 0644]	patch \| blob
tests/sav/nitpick_args1.res		patch \| blob \| history
tests/sav/test_read_all.res	[new file with mode: 0644]	patch \| blob
tests/sav/test_read_all_args1.res	[new file with mode: 0644]	patch \| blob
tests/test_read_all.args	[new file with mode: 0644]	patch \| blob
tests/test_read_all.nit	[new file with mode: 0644]	patch \| blob