lib/core: Improve speed of an indexed access in a UTF-8 `Text` entity
authorLucas Bajolet <r4pass@hotmail.com>
Tue, 8 Dec 2015 18:29:20 +0000 (13:29 -0500)
committerLucas Bajolet <r4pass@hotmail.com>
Tue, 29 Dec 2015 04:49:28 +0000 (23:49 -0500)
Signed-off-by: Lucas Bajolet <r4pass@hotmail.com>

lib/core/text/flat.nit
lib/core/text/native.nit

index 6e87257..6e1e610 100644 (file)
@@ -50,10 +50,24 @@ redef class FlatText
 
        # Index of the character `index` in `_items`
        fun char_to_byte_index(index: Int): Int do
-               var ln = length
-               assert index >= 0
-               assert index < ln
+               var dpos = index - _position
+               var b = _bytepos
+
+               if dpos == 0 then return b
+               if dpos == 1 then
+                       b += _items.length_of_char_at(b)
+                       _bytepos = b
+                       _position = index
+                       return b
+               end
+               if dpos == -1 then
+                       b = _items.find_beginning_of_char_at(b - 1)
+                       _bytepos = b
+                       _position = index
+                       return b
+               end
 
+               var ln = _length
                var pos = _position
                # Find best insertion point
                var delta_begin = index
@@ -68,15 +82,15 @@ redef class FlatText
                var ns_i: Int
                var my_i: Int
 
-               if min == delta_begin then
-                       ns_i = first_byte
-                       my_i = 0
-               else if min == delta_cache then
+               if min == delta_cache then
                        ns_i = _bytepos
                        my_i = pos
+               else if min == delta_begin then
+                       ns_i = first_byte
+                       my_i = 0
                else
                        ns_i = its.find_beginning_of_char_at(last_byte)
-                       my_i = length - 1
+                       my_i = _length - 1
                end
 
                ns_i = its.char_to_byte_index_cached(index, my_i, ns_i)
@@ -277,7 +291,21 @@ redef class FlatText
                return nns.to_s_unsafe(nlen)
        end
 
-       redef fun [](index) do return _items.char_at(char_to_byte_index(index))
+       redef fun [](index) do
+               assert index >= 0 and index < _length
+               return fetch_char_at(index)
+       end
+
+       # Gets a `Char` at `index` in `self`
+       #
+       # WARNING: Use at your own risks as no bound-checking is done
+       fun fetch_char_at(index: Int): Char do
+               var i = char_to_byte_index(index)
+               var items = _items
+               var b = items[i]
+               if b & 0x80u8 == 0x00u8 then return b.ascii
+               return items.char_at(i)
+       end
 
        # If `self` contains only digits and alpha <= 'f', return the corresponding integer.
        #
@@ -317,11 +345,12 @@ class FlatString
                return new_items
        end
 
-       redef fun reversed
-       do
+       redef fun reversed do
                var b = new FlatBuffer.with_capacity(_bytelen + 1)
-               for i in [0 .. _length[.step(-1) do
-                       b.add self[i]
+               var i = _length - 1
+               while i >= 0 do
+                       b.add self.fetch_char_at(i)
+                       i -= 1
                end
                var s = b.to_s.as(FlatString)
                s._length = self._length
@@ -651,10 +680,9 @@ private class FlatStringByteView
        do
                # Check that the index (+ _first_byte) is not larger than last_byte
                # In other terms, if the index is valid
-               assert index >= 0
-               var target = self.target
+               var target = _target
+               assert index >= 0 and index < target._bytelen
                var ind = index + target._first_byte
-               assert ind <= target.last_byte
                return target._items[ind]
        end
 
@@ -747,7 +775,6 @@ class FlatBuffer
                        lshift_bytes(ip + clen, -size_diff)
                end
                _bytelen += size_diff
-               bytepos += size_diff
                it.set_char_at(ip, item)
        end
 
index d370ee8..fad3ae1 100644 (file)
@@ -112,14 +112,34 @@ extern class NativeString `{ char* `}
        # ~~~raw
        #     assert "かきく".as(FlatString).items.char_at(1) == '�'
        # ~~~
-       fun char_at(pos: Int): Char `{
-               char c = self[pos];
-               if((c & 0x80) == 0x00) return (uint32_t)c;
-               if(((c & 0xE0) == 0xC0) && ((self[pos + 1] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0x1F) << 6) + ((((uint32_t)self[pos + 1] & 0x3F)));
-               if(((c & 0xF0) == 0xE0) && ((self[pos + 1] & 0xC0) == 0x80) && ((self[pos + 2] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0xF) << 12) + ((((uint32_t)self[pos + 1]) & 0x3F) << 6) + ((((uint32_t)self[pos + 2] & 0x3F)));
-               if(((c & 0xF8) == 0xF0) && ((self[pos + 1] & 0xC0) == 0x80) && ((self[pos + 2] & 0xC0) == 0x80) && ((self[pos + 3] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0x7) << 18) + ((((uint32_t)self[pos + 1]) & 0x3F) << 12) + ((((uint32_t)self[pos + 2]) & 0x3F) << 6) + ((((uint32_t)self[pos + 3] & 0x3F)));
-               return 0xFFFD;
-       `}
+       fun char_at(pos: Int): Char do
+               var c = self[pos]
+               if c & 0x80u8 == 0u8 then return c.ascii
+               var b = fetch_4_hchars(pos)
+               var ret = 0
+               if b & 0xC00000 != 0x800000 then return 0xFFFD.code_point
+               if b & 0xE0000000 == 0xC0000000 then
+                       ret |= (b & 0x1F000000) >> 18
+                       ret |= (b & 0x3F0000) >> 16
+                       return ret.code_point
+               end
+               if not b & 0xC000 == 0x8000 then return 0xFFFD.code_point
+               if b & 0xF0000000 == 0xE0000000 then
+                       ret |= (b & 0xF000000) >> 12
+                       ret |= (b & 0x3F0000) >> 10
+                       ret |= (b & 0x3F00) >> 8
+                       return ret.code_point
+               end
+               if not b & 0xC0 == 0x80 then return 0xFFFD.code_point
+               if b & 0xF8000000 == 0xF0000000 then
+                       ret |= (b.to_i & 0x7000000) >> 6
+                       ret |= (b.to_i & 0x3F0000) >> 4
+                       ret |= (b.to_i & 0x3F00) >> 2
+                       ret |= b.to_i & 0x3F
+                       return ret.code_point
+               end
+               return 0xFFFD.code_point
+       end
 
        # Gets the byte index of char at position `n` in UTF-8 String
        fun char_to_byte_index(n: Int): Int do return char_to_byte_index_cached(n, 0, 0)
@@ -150,14 +170,34 @@ extern class NativeString `{ char* `}
                var ns_i = byte_from
                var my_i = char_from
 
-               while my_i < n do
+               var dist = n - my_i
+
+               while dist > 0 do
+                       while dist >= 4 do
+                               var i = fetch_4_chars(ns_i)
+                               if i & 0x80808080 != 0 then break
+                               ns_i += 4
+                               my_i += 4
+                               dist -= 4
+                       end
+                       if dist == 0 then break
                        ns_i += length_of_char_at(ns_i)
                        my_i += 1
+                       dist -= 1
                end
 
-               while my_i > n do
+               while dist < 0 do
+                       while dist <= -4 do
+                               var i = fetch_4_chars(ns_i - 4)
+                               if i & 0x80808080 != 0 then break
+                               ns_i -= 4
+                               my_i -= 4
+                               dist += 4
+                       end
+                       if dist == 0 then break
                        ns_i = find_beginning_of_char_at(ns_i - 1)
                        my_i -= 1
+                       dist += 1
                end
 
                return ns_i