lib/core: Improve speed of an indexed access in a UTF-8 `Text` entity
[nit.git] / lib / core / text / native.nit
index d370ee8..fad3ae1 100644 (file)
@@ -112,14 +112,34 @@ extern class NativeString `{ char* `}
        # ~~~raw
        #     assert "かきく".as(FlatString).items.char_at(1) == '�'
        # ~~~
-       fun char_at(pos: Int): Char `{
-               char c = self[pos];
-               if((c & 0x80) == 0x00) return (uint32_t)c;
-               if(((c & 0xE0) == 0xC0) && ((self[pos + 1] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0x1F) << 6) + ((((uint32_t)self[pos + 1] & 0x3F)));
-               if(((c & 0xF0) == 0xE0) && ((self[pos + 1] & 0xC0) == 0x80) && ((self[pos + 2] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0xF) << 12) + ((((uint32_t)self[pos + 1]) & 0x3F) << 6) + ((((uint32_t)self[pos + 2] & 0x3F)));
-               if(((c & 0xF8) == 0xF0) && ((self[pos + 1] & 0xC0) == 0x80) && ((self[pos + 2] & 0xC0) == 0x80) && ((self[pos + 3] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0x7) << 18) + ((((uint32_t)self[pos + 1]) & 0x3F) << 12) + ((((uint32_t)self[pos + 2]) & 0x3F) << 6) + ((((uint32_t)self[pos + 3] & 0x3F)));
-               return 0xFFFD;
-       `}
+       fun char_at(pos: Int): Char do
+               var c = self[pos]
+               if c & 0x80u8 == 0u8 then return c.ascii
+               var b = fetch_4_hchars(pos)
+               var ret = 0
+               if b & 0xC00000 != 0x800000 then return 0xFFFD.code_point
+               if b & 0xE0000000 == 0xC0000000 then
+                       ret |= (b & 0x1F000000) >> 18
+                       ret |= (b & 0x3F0000) >> 16
+                       return ret.code_point
+               end
+               if not b & 0xC000 == 0x8000 then return 0xFFFD.code_point
+               if b & 0xF0000000 == 0xE0000000 then
+                       ret |= (b & 0xF000000) >> 12
+                       ret |= (b & 0x3F0000) >> 10
+                       ret |= (b & 0x3F00) >> 8
+                       return ret.code_point
+               end
+               if not b & 0xC0 == 0x80 then return 0xFFFD.code_point
+               if b & 0xF8000000 == 0xF0000000 then
+                       ret |= (b.to_i & 0x7000000) >> 6
+                       ret |= (b.to_i & 0x3F0000) >> 4
+                       ret |= (b.to_i & 0x3F00) >> 2
+                       ret |= b.to_i & 0x3F
+                       return ret.code_point
+               end
+               return 0xFFFD.code_point
+       end
 
        # Gets the byte index of char at position `n` in UTF-8 String
        fun char_to_byte_index(n: Int): Int do return char_to_byte_index_cached(n, 0, 0)
@@ -150,14 +170,34 @@ extern class NativeString `{ char* `}
                var ns_i = byte_from
                var my_i = char_from
 
-               while my_i < n do
+               var dist = n - my_i
+
+               while dist > 0 do
+                       while dist >= 4 do
+                               var i = fetch_4_chars(ns_i)
+                               if i & 0x80808080 != 0 then break
+                               ns_i += 4
+                               my_i += 4
+                               dist -= 4
+                       end
+                       if dist == 0 then break
                        ns_i += length_of_char_at(ns_i)
                        my_i += 1
+                       dist -= 1
                end
 
-               while my_i > n do
+               while dist < 0 do
+                       while dist <= -4 do
+                               var i = fetch_4_chars(ns_i - 4)
+                               if i & 0x80808080 != 0 then break
+                               ns_i -= 4
+                               my_i -= 4
+                               dist += 4
+                       end
+                       if dist == 0 then break
                        ns_i = find_beginning_of_char_at(ns_i - 1)
                        my_i -= 1
+                       dist += 1
                end
 
                return ns_i