lib/standard: Added services on NativeString and Char for the support of UTF-8
author Lucas Bajolet <r4pass@hotmail.com>
Fri, 10 Jul 2015 20:15:03 +0000 (16:15 -0400)
committer Lucas Bajolet <r4pass@hotmail.com>
Thu, 23 Jul 2015 15:17:32 +0000 (11:17 -0400)
Signed-off-by: Lucas Bajolet <r4pass@hotmail.com>

lib/standard/kernel.nit
lib/standard/text/abstract_text.nit
lib/standard/text/flat.nit
lib/standard/text/native.nit

index e05d152..c7f0751 100644 (file)
@@ -719,10 +719,23 @@ universal Char
        redef type OTHER: Char
 
        redef fun object_id is intern
+       # Prints `self` on the standard output, encoded in UTF-8
+       redef fun output `{
+               if(self < 128){
+                       // Below 0x80: printed as a single byte
+                       printf("%c", self);
+               }else if(self < 2048){
+                       // Below 0x800: leader 110xxxxx followed by 1 continuation byte 10xxxxxx
+                       printf("%c%c", 0xC0 | ((0x7C0 & self) >> 6), 0x80 | (0x3F & self));
+               }else if(self < 65536){
+                       // Below 0x10000: leader 1110xxxx followed by 2 continuation bytes
+                       printf("%c%c%c", 0xE0 | ((0xF000 & self) >> 12), 0x80 | ((0xFC0 & self) >> 6) ,0x80 | (0x3F & self));
+               }else if(self < 2097152){
+                       // Below 0x200000: leader 11110xxx followed by 3 continuation bytes
+                       printf("%c%c%c%c", 0xF0 | ((0x1C0000 & self) >> 18), 0x80 | ((0x3F000 & self) >> 12), 0x80 | ((0xFC0 & self) >> 6), 0x80 | (0x3F & self));
+               }else{
+                       // Bad char: the value needs more than 21 bits, it is printed with a plain `%c`
+                       printf("%c", self);
+               }
+       `}
        redef fun hash do return ascii
        redef fun ==(o) is intern
        redef fun !=(o) is intern
-       redef fun output is intern
 
        redef fun <=(i) is intern
        redef fun <(i) is intern
index af45dab..cad7522 100644 (file)
@@ -1524,14 +1524,50 @@ redef class Float
 end
 
 redef class Char
+
+       # Number of bytes used by `self` once encoded in UTF-8
+       #
+       # Values above 0x10FFFF are not valid code points, 1 is returned for them.
+       private fun u8char_len: Int do
+               var cp = self.ascii
+               if cp < 0x80 then
+                       return 1
+               else if cp <= 0x7FF then
+                       return 2
+               else if cp <= 0xFFFF then
+                       return 3
+               else if cp <= 0x10FFFF then
+                       return 4
+               end
+               # Bad character format
+               return 1
+       end
+
+       # Returns a String made of `self` encoded in UTF-8
+       #
        #     assert 'x'.to_s    == "x"
-       redef fun to_s
-       do
-               var s = new Buffer.with_cap(1)
-               s.chars[0] = self
-               return s.to_s
+       redef fun to_s do
+               # Number of bytes of the UTF-8 encoding of `self`
+               var ln = u8char_len
+               # One extra byte for the NUL terminator written by `u8char_tos`
+               var ns = new NativeString(ln + 1)
+               u8char_tos(ns, ln)
+               return ns.to_s_with_length(ln)
        end
 
+       # Writes the UTF-8 encoding of `self` in `r`, with a NUL byte at index `len`
+       #
+       # `len` selects the encoded length (1 to 4 bytes).
+       # For any other value only the NUL byte is written.
+       # No bound is checked here: bytes up to index `len` of `r` are written.
+       private fun u8char_tos(r: NativeString, len: Int) `{
+               // Terminator first, the encoded bytes are written before it
+               r[len] = '\0';
+               switch(len){
+                       case 1:
+                               // Single byte, stored as-is
+                               r[0] = self;
+                               break;
+                       case 2:
+                               // 110xxxxx 10xxxxxx
+                               r[0] = 0xC0 | ((self & 0x7C0) >> 6);
+                               r[1] = 0x80 | (self & 0x3F);
+                               break;
+                       case 3:
+                               // 1110xxxx 10xxxxxx 10xxxxxx
+                               r[0] = 0xE0 | ((self & 0xF000) >> 12);
+                               r[1] = 0x80 | ((self & 0xFC0) >> 6);
+                               r[2] = 0x80 | (self & 0x3F);
+                               break;
+                       case 4:
+                               // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+                               r[0] = 0xF0 | ((self & 0x1C0000) >> 18);
+                               r[1] = 0x80 | ((self & 0x3F000) >> 12);
+                               r[2] = 0x80 | ((self & 0xFC0) >> 6);
+                               r[3] = 0x80 | (self & 0x3F);
+                               break;
+               }
+       `}
+
        # Returns true if the char is a numerical digit
        #
        #     assert '0'.is_numeric
index 04fabc9..66e7734 100644 (file)
@@ -869,6 +869,38 @@ redef class NativeString
                str.real_items = new_self
                return str
        end
+
+       # Encodes `c` in UTF-8 and writes its bytes in `self`, starting at position `pos`
+       #
+       # Very unsafe, make sure to have room for this char prior to calling this function.
+       private fun set_char_at(pos: Int, c: Char) do
+               native_set_char(pos, c, c.u8char_len)
+       end
+
+       # Writes the `ln` bytes of the UTF-8 encoding of `c` in `self`, starting at byte `pos`
+       #
+       # `ln` is expected to be between 1 and 4, nothing is written for any other value.
+       # No bound is checked and no terminator is written.
+       private fun native_set_char(pos: Int, c: Char, ln: Int) `{
+               char* dst = self + pos;
+               switch(ln){
+                       case 1:
+                               // Single byte, stored as-is
+                               dst[0] = c;
+                               break;
+                       case 2:
+                               // 110xxxxx 10xxxxxx
+                               dst[0] = 0xC0 | ((c & 0x7C0) >> 6);
+                               dst[1] = 0x80 | (c & 0x3F);
+                               break;
+                       case 3:
+                               // 1110xxxx 10xxxxxx 10xxxxxx
+                               dst[0] = 0xE0 | ((c & 0xF000) >> 12);
+                               dst[1] = 0x80 | ((c & 0xFC0) >> 6);
+                               dst[2] = 0x80 | (c & 0x3F);
+                               break;
+                       case 4:
+                               // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+                               dst[0] = 0xF0 | ((c & 0x1C0000) >> 18);
+                               dst[1] = 0x80 | ((c & 0x3F000) >> 12);
+                               dst[2] = 0x80 | ((c & 0xFC0) >> 6);
+                               dst[3] = 0x80 | (c & 0x3F);
+                               break;
+               }
+       `}
 end
 
 redef class Int
index 8b66384..b2789a0 100644 (file)
 module native
 
 import kernel
+import math
+
+redef class Byte
+       # Length in bytes of the UTF-8 sequence that starts with `self`
+       #
+       # Returns 1 when `self` is not a valid leading byte.
+       private fun u8len: Int do
+               # 0xxxxxxx: single-byte char
+               if self & 0b1000_0000u8 == 0u8 then return 1
+               # 110xxxxx: leader of a 2-byte sequence
+               if self & 0b1110_0000u8 == 0b1100_0000u8 then return 2
+               # 1110xxxx: leader of a 3-byte sequence
+               if self & 0b1111_0000u8 == 0b1110_0000u8 then return 3
+               # 11110xxx: leader of a 4-byte sequence
+               if self & 0b1111_1000u8 == 0b1111_0000u8 then return 4
+               # Anything else is not a leading byte
+               return 1
+       end
+end
 
 # Native strings are simple C char *
 extern class NativeString `{ char* `}
@@ -46,4 +64,90 @@ extern class NativeString `{ char* `}
 
        # Parse `self` as a Float.
        fun atof: Float `{ return atof(self); `}
+
+       # Gets the UTF-8 char starting at byte index `pos`
+       #
+       # `pos` is expressed in bytes, not in Unicode chars.
+       #
+       # ~~~raw
+       #     assert "かきく".as(FlatString).items.char_at(0) == 'か'
+       # ~~~
+       #
+       # If the bytes at position `pos` do not start a valid UTF-8 sequence,
+       # the Unicode replacement character � (0xFFFD) is returned.
+       #
+       # ~~~raw
+       #     assert "かきく".as(FlatString).items.char_at(1) == '�'
+       # ~~~
+       fun char_at(pos: Int): Char `{
+               char c = self[pos];
+               // 0xxxxxxx: single byte
+               if((c & 0x80) == 0x00) return (uint32_t)c;
+               // 110xxxxx followed by 1 continuation byte
+               if(((c & 0xE0) == 0xC0) && ((self[pos + 1] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0x1F) << 6) + ((((uint32_t)self[pos + 1] & 0x3F)));
+               // 1110xxxx followed by 2 continuation bytes
+               if(((c & 0xF0) == 0xE0) && ((self[pos + 1] & 0xC0) == 0x80) && ((self[pos + 2] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0xF) << 12) + ((((uint32_t)self[pos + 1]) & 0x3F) << 6) + ((((uint32_t)self[pos + 2] & 0x3F)));
+               // 11110xxx followed by 3 continuation bytes.
+               // The leader mask is 0xF8: the top five bits must be 11110, the low three bits carry data.
+               if(((c & 0xF8) == 0xF0) && ((self[pos + 1] & 0xC0) == 0x80) && ((self[pos + 2] & 0xC0) == 0x80) && ((self[pos + 3] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0x7) << 18) + ((((uint32_t)self[pos + 1]) & 0x3F) << 12) + ((((uint32_t)self[pos + 2]) & 0x3F) << 6) + ((((uint32_t)self[pos + 3] & 0x3F)));
+               // Not the start of a valid sequence
+               return 0xFFFD;
+       `}
+
+       # Gets the byte index of char at position `n` in UTF-8 String
+       #
+       # The search starts from char 0 at byte 0.
+       fun char_to_byte_index(n: Int): Int do
+               return char_to_byte_index_cached(n, 0, 0)
+       end
+
+       # Gets the length in bytes of the character starting at byte `pos` (1 if invalid sequence)
+       #
+       # A multi-byte length is returned only when the leading byte is followed
+       # by the expected number of continuation bytes (`10xxxxxx`).
+       fun length_of_char_at(pos: Int): Int do
+               var c = self[pos]
+               if c & 0x80u8 == 0x00u8 then
+                       return 1
+               else if c & 0xE0u8 == 0xC0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 then
+                       return 2
+               else if c & 0xF0u8 == 0xE0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 then
+                       return 3
+               # 4-byte leader is `11110xxx`: the mask must be 0xF8 so that the three low bits are free
+               else if c & 0xF8u8 == 0xF0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 and self[pos + 3] & 0xC0u8 == 0x80u8 then
+                       return 4
+               else
+                       return 1
+               end
+       end
+
+       # Gets the byte index of char at position `n` in UTF-8 String
+       #
+       # `char_from` and `byte_from` are cached values to seek from.
+       #
+       # NOTE: `char_from` and `byte_from` are not guaranteed to be valid cache values.
+       # It is up to the client to ensure the validity of the information.
+       fun char_to_byte_index_cached(n, char_from, byte_from: Int): Int do
+               var byte_pos = byte_from
+               var char_pos = char_from
+
+               if char_pos < n then
+                       # Target is after the cached position: skip forward one char at a time
+                       while char_pos < n do
+                               byte_pos += length_of_char_at(byte_pos)
+                               char_pos += 1
+                       end
+               else
+                       # Target is at or before the cached position: walk back one char at a time
+                       while char_pos > n do
+                               byte_pos = find_beginning_of_char_at(byte_pos - 1)
+                               char_pos -= 1
+                       end
+               end
+
+               return byte_pos
+       end
+
+       # Returns the beginning position of the char at position `pos`
+       #
+       # If the char is invalid UTF-8, `pos` is returned as-is
+       #
+       # The backward search never goes before index 0, even when `self`
+       # starts with continuation bytes.
+       #
+       # ~~~raw
+       #       assert "abc".items.find_beginning_of_char_at(2) == 2
+       #       assert "か".items.find_beginning_of_char_at(1) == 0
+       #       assert [0x41u8, 233u8].to_s.items.find_beginning_of_char_at(1) == 1
+       # ~~~
+       fun find_beginning_of_char_at(pos: Int): Int do
+               var endpos = pos
+               var c = self[pos]
+               # Step back over continuation bytes (`10xxxxxx`), stopping at the first byte of `self`
+               while pos > 0 and c & 0xC0u8 == 0x80u8 do
+                       pos -= 1
+                       c = self[pos]
+               end
+               var stpos = pos
+               # The byte found is the beginning only if its sequence is long enough to reach `endpos`
+               if length_of_char_at(stpos) >= (endpos - stpos + 1) then return pos
+               return endpos
+       end
 end