From 3f4025d72cefa976c2cac241a687f173202dc219 Mon Sep 17 00:00:00 2001
From: Lucas Bajolet
Date: Fri, 10 Jul 2015 16:15:03 -0400
Subject: [PATCH] lib/standard: Added services on NativeString and Char for the support of UTF-8

Signed-off-by: Lucas Bajolet
---
 lib/standard/kernel.nit             |   15 ++++-
 lib/standard/text/abstract_text.nit |   46 ++++++++++++++--
 lib/standard/text/flat.nit          |   32 +++++++++++
 lib/standard/text/native.nit        |  104 +++++++++++++++++++++++++++++++++++
 4 files changed, 191 insertions(+), 6 deletions(-)

diff --git a/lib/standard/kernel.nit b/lib/standard/kernel.nit
index e05d152..c7f0751 100644
--- a/lib/standard/kernel.nit
+++ b/lib/standard/kernel.nit
@@ -719,10 +719,23 @@ universal Char
 	redef type OTHER: Char
 
 	redef fun object_id is intern
+	redef fun output `{
+		if(self < 128){ // 1 byte: 0xxxxxxx
+			printf("%c", self);
+		}else if(self < 2048){ // 2 bytes: 110xxxxx 10xxxxxx
+			printf("%c%c", 0xC0 | ((0x7C0 & self) >> 6), 0x80 | (0x3F & self));
+		}else if(self < 65536){ // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+			printf("%c%c%c", 0xE0 | ((0xF000 & self) >> 12), 0x80 | ((0xFC0 & self) >> 6) ,0x80 | (0x3F & self));
+		}else if(self < 2097152){ // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+			printf("%c%c%c%c", 0xF0 | ((0x1C0000 & self) >> 18), 0x80 | ((0x3F000 & self) >> 12), 0x80 | ((0xFC0 & self) >> 6), 0x80 | (0x3F & self));
+		}else{
+			// Bad char: the value needs more than 21 bits, it is handed to "%c" without any encoding
+			printf("%c", self);
+		}
+	`}
 	redef fun hash do return ascii
 	redef fun ==(o) is intern
 	redef fun !=(o) is intern
-	redef fun output is intern
 
 	redef fun <=(i) is intern
 	redef fun <(i) is intern
diff --git a/lib/standard/text/abstract_text.nit b/lib/standard/text/abstract_text.nit
index af45dab..cad7522 100644
--- a/lib/standard/text/abstract_text.nit
+++ b/lib/standard/text/abstract_text.nit
@@ -1524,14 +1524,50 @@ redef class Float
 end
 
 redef class Char
+
+	# Number of bytes used by `self` once encoded in UTF-8 (1 when `self` is above 0x10FFFF)
+	private fun u8char_len: Int do
+		var c = self.ascii
+		if c < 0x80 then return 1
+		if c <= 0x7FF then return 2
+		if c <= 0xFFFF then return 3
+		if c <= 0x10FFFF then return 4
+		# Bad character format (value above 0x10FFFF): counted as a single byte
+		return 1
+	end
+
 	# assert 'x'.to_s == "x"
-	redef fun to_s
-	do
-		var s = new Buffer.with_cap(1)
-		s.chars[0] = self
-		return s.to_s
+	redef fun to_s do
+		var ln = u8char_len
+		var ns = new NativeString(ln + 1) # one extra byte: `u8char_tos` writes a NUL at index `ln`
+		u8char_tos(ns, ln)
+		return ns.to_s_with_length(ln)
 	end
 
+	private fun u8char_tos(r: NativeString, len: Int) `{
+		r[len] = '\0'; // terminator goes right after the encoded bytes, so `r` must hold len + 1 bytes
+		switch(len){
+			case 1: // 0xxxxxxx
+				r[0] = self;
+				break;
+			case 2: // 110xxxxx 10xxxxxx
+				r[0] = 0xC0 | ((self & 0x7C0) >> 6);
+				r[1] = 0x80 | (self & 0x3F);
+				break;
+			case 3: // 1110xxxx 10xxxxxx 10xxxxxx
+				r[0] = 0xE0 | ((self & 0xF000) >> 12);
+				r[1] = 0x80 | ((self & 0xFC0) >> 6);
+				r[2] = 0x80 | (self & 0x3F);
+				break;
+			case 4: // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+				r[0] = 0xF0 | ((self & 0x1C0000) >> 18);
+				r[1] = 0x80 | ((self & 0x3F000) >> 12);
+				r[2] = 0x80 | ((self & 0xFC0) >> 6);
+				r[3] = 0x80 | (self & 0x3F);
+				break;
+		}
+	`}
+
 	# Returns true if the char is a numerical digit
 	#
 	# assert '0'.is_numeric
diff --git a/lib/standard/text/flat.nit b/lib/standard/text/flat.nit
index 04fabc9..66e7734 100644
--- a/lib/standard/text/flat.nit
+++ b/lib/standard/text/flat.nit
@@ -869,6 +869,38 @@ redef class NativeString
 		str.real_items = new_self
 		return str
 	end
+
+	# Writes `c`, encoded in UTF-8, into the bytes of `self` starting at byte position `pos`
+	#
+	# Very unsafe: no bound is checked, make sure to have room for this char (`c.u8char_len` bytes) prior to calling this function.
+	private fun set_char_at(pos: Int, c: Char) do
+		var ln = c.u8char_len
+		native_set_char(pos, c, ln)
+	end
+
+	private fun native_set_char(pos: Int, c: Char, ln: Int) `{
+		char* dst = self + pos; // `pos` is a byte offset into `self`
+		switch(ln){ // `ln` is the byte count of the encoding; other values write nothing
+			case 1: // 0xxxxxxx
+				dst[0] = c;
+				break;
+			case 2: // 110xxxxx 10xxxxxx
+				dst[0] = 0xC0 | ((c & 0x7C0) >> 6);
+				dst[1] = 0x80 | (c & 0x3F);
+				break;
+			case 3: // 1110xxxx 10xxxxxx 10xxxxxx
+				dst[0] = 0xE0 | ((c & 0xF000) >> 12);
+				dst[1] = 0x80 | ((c & 0xFC0) >> 6);
+				dst[2] = 0x80 | (c & 0x3F);
+				break;
+			case 4: // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+				dst[0] = 0xF0 | ((c & 0x1C0000) >> 18);
+				dst[1] = 0x80 | ((c & 0x3F000) >> 12);
+				dst[2] = 0x80 | ((c & 0xFC0) >> 6);
+				dst[3] = 0x80 | (c & 0x3F);
+				break;
+		}
+	`}
 end
 
 redef class Int
diff --git a/lib/standard/text/native.nit b/lib/standard/text/native.nit
index 8b66384..b2789a0 100644
--- a/lib/standard/text/native.nit
+++ b/lib/standard/text/native.nit
@@ -12,6 +12,24 @@
 module native
 
 import kernel
+import math
+
+redef class Byte
+	# Gives the length in bytes of the UTF-8 char whose first byte is `self` (1 if `self` is not a lead byte)
+	private fun u8len: Int do
+		if self & 0b1000_0000u8 == 0u8 then
+			return 1
+		else if self & 0b1110_0000u8 == 0b1100_0000u8 then
+			return 2
+		else if self & 0b1111_0000u8 == 0b1110_0000u8 then
+			return 3
+		else if self & 0b1111_1000u8 == 0b1111_0000u8 then
+			return 4
+		else
+			return 1 # neither 0xxxxxxx, 110xxxxx, 1110xxxx nor 11110xxx
+		end
+	end
+end
 
 # Native strings are simple C char *
 extern class NativeString `{ char* `}
@@ -46,4 +64,90 @@ extern class NativeString `{ char* `}
 
 	# Parse `self` as a Float.
 	fun atof: Float `{ return atof(self); `}
+
+	# Gets the UTF-8 char starting at byte index `pos`
+	#
+	# Index is expressed in bytes, not in Unicode chars
+	#
+	# ~~~raw
+	# assert "かきく".as(FlatString).items.char_at(0) == 'か'
+	# ~~~
+	#
+	# If the bytes at position `pos` are not a valid UTF-8 sequence,
+	# the Unicode replacement character � (0xFFFD) will be used.
+	#
+	# ~~~raw
+	# assert "かきく".as(FlatString).items.char_at(1) == '�'
+	# ~~~
+	fun char_at(pos: Int): Char `{
+		char c = self[pos]; // lead byte; every use below is masked, so the signedness of `char` does not matter
+		if((c & 0x80) == 0x00) return (uint32_t)c;
+		if(((c & 0xE0) == 0xC0) && ((self[pos + 1] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0x1F) << 6) + ((((uint32_t)self[pos + 1] & 0x3F)));
+		if(((c & 0xF0) == 0xE0) && ((self[pos + 1] & 0xC0) == 0x80) && ((self[pos + 2] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0xF) << 12) + ((((uint32_t)self[pos + 1]) & 0x3F) << 6) + ((((uint32_t)self[pos + 2] & 0x3F)));
+		if(((c & 0xF8) == 0xF0) && ((self[pos + 1] & 0xC0) == 0x80) && ((self[pos + 2] & 0xC0) == 0x80) && ((self[pos + 3] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0x7) << 18) + ((((uint32_t)self[pos + 1]) & 0x3F) << 12) + ((((uint32_t)self[pos + 2]) & 0x3F) << 6) + ((((uint32_t)self[pos + 3] & 0x3F)));
+		return 0xFFFD; // not a lead byte, or a continuation byte is missing
+	`}
+
+	# Gets the byte index of char at position `n` in UTF-8 String, seeking from the start (char 0, byte 0)
+	fun char_to_byte_index(n: Int): Int do return char_to_byte_index_cached(n, 0, 0)
+
+	# Gets the length in bytes of the character starting at byte position `pos` (1 if invalid sequence)
+	fun length_of_char_at(pos: Int): Int do
+		var c = self[pos]
+		if c & 0x80u8 == 0x00u8 then
+			return 1
+		else if c & 0xE0u8 == 0xC0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 then
+			return 2
+		else if c & 0xF0u8 == 0xE0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 then
+			return 3
+		else if c & 0xF8u8 == 0xF0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 and self[pos + 3] & 0xC0u8 == 0x80u8 then
+			return 4 # 11110xxx lead: the mask must be 0xF8 so that 0xF1..0xF7 match and 0xF8 does not
+		else
+			return 1
+		end
+	end
+
+	# Gets the byte index of char at position `n` in UTF-8 String
+	#
+	# `char_from` and `byte_from` are cached values to seek from.
+	#
+	# NOTE: char_from and byte_from are not guaranteed to be valid cache values
+	# It is up to the client to ensure the validity of the information
+	fun char_to_byte_index_cached(n, char_from, byte_from: Int): Int do
+		var ns_i = byte_from
+		var my_i = char_from
+
+		while my_i < n do # seek forward, one char at a time
+			ns_i += length_of_char_at(ns_i)
+			my_i += 1
+		end
+
+		while my_i > n do # seek backward, one char at a time
+			ns_i = find_beginning_of_char_at(ns_i - 1)
+			my_i -= 1
+		end
+
+		return ns_i
+	end
+
+	# Returns the beginning position of the char at position `pos`
+	#
+	# If the char is invalid UTF-8, `pos` is returned as-is
+	#
+	# ~~~raw
+	# assert "abc".items.find_beginning_of_char_at(2) == 2
+	# assert "か".items.find_beginning_of_char_at(1) == 0
+	# assert [0x41u8, 233u8].to_s.items.find_beginning_of_char_at(1) == 1
+	# ~~~
+	fun find_beginning_of_char_at(pos: Int): Int do
+		var endpos = pos
+		var c = self[pos]
+		while pos > 0 and c & 0xC0u8 == 0x80u8 do # skip continuation bytes, never reading before index 0
+			pos -= 1
+			c = self[pos]
+		end
+		var stpos = pos
+		if length_of_char_at(stpos) >= (endpos - stpos + 1) then return pos # the char found at `stpos` covers `endpos`
+		return endpos
+	end
 end
-- 
1.7.9.5