From 8c67cf8050f321e6bbff09a3f8865f3e32bf9d5b Mon Sep 17 00:00:00 2001 From: Lucas Bajolet Date: Tue, 8 Dec 2015 13:29:20 -0500 Subject: [PATCH] lib/core: Improve speed of an indexed access in a UTF-8 `Text` entity Signed-off-by: Lucas Bajolet --- lib/core/text/flat.nit | 61 +++++++++++++++++++++++++++++++++------------- lib/core/text/native.nit | 60 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 94 insertions(+), 27 deletions(-) diff --git a/lib/core/text/flat.nit b/lib/core/text/flat.nit index 6e87257..6e1e610 100644 --- a/lib/core/text/flat.nit +++ b/lib/core/text/flat.nit @@ -50,10 +50,24 @@ redef class FlatText # Index of the character `index` in `_items` fun char_to_byte_index(index: Int): Int do - var ln = length - assert index >= 0 - assert index < ln + var dpos = index - _position + var b = _bytepos + + if dpos == 0 then return b + if dpos == 1 then + b += _items.length_of_char_at(b) + _bytepos = b + _position = index + return b + end + if dpos == -1 then + b = _items.find_beginning_of_char_at(b - 1) + _bytepos = b + _position = index + return b + end + var ln = _length var pos = _position # Find best insertion point var delta_begin = index @@ -68,15 +82,15 @@ redef class FlatText var ns_i: Int var my_i: Int - if min == delta_begin then - ns_i = first_byte - my_i = 0 - else if min == delta_cache then + if min == delta_cache then ns_i = _bytepos my_i = pos + else if min == delta_begin then + ns_i = first_byte + my_i = 0 else ns_i = its.find_beginning_of_char_at(last_byte) - my_i = length - 1 + my_i = _length - 1 end ns_i = its.char_to_byte_index_cached(index, my_i, ns_i) @@ -277,7 +291,21 @@ redef class FlatText return nns.to_s_unsafe(nlen) end - redef fun [](index) do return _items.char_at(char_to_byte_index(index)) + redef fun [](index) do + assert index >= 0 and index < _length + return fetch_char_at(index) + end + + # Gets a `Char` at `index` in `self` + # + # WARNING: Use at your own risks as no bound-checking is done + fun fetch_char_at(index: Int): Char do + var i = char_to_byte_index(index) + var items = _items + var b = items[i] + if b & 0x80u8 == 0x00u8 then return b.ascii + return items.char_at(i) + end # If `self` contains only digits and alpha <= 'f', return the corresponding integer. # @@ -317,11 +345,12 @@ class FlatString return new_items end - redef fun reversed - do + redef fun reversed do var b = new FlatBuffer.with_capacity(_bytelen + 1) - for i in [0 .. _length[.step(-1) do - b.add self[i] + var i = _length - 1 + while i >= 0 do + b.add self.fetch_char_at(i) + i -= 1 end var s = b.to_s.as(FlatString) s._length = self._length @@ -651,10 +680,9 @@ private class FlatStringByteView do # Check that the index (+ _first_byte) is not larger than last_byte # In other terms, if the index is valid - assert index >= 0 - var target = self.target + var target = _target + assert index >= 0 and index < target._bytelen var ind = index + target._first_byte - assert ind <= target.last_byte return target._items[ind] end @@ -747,7 +775,6 @@ class FlatBuffer lshift_bytes(ip + clen, -size_diff) end _bytelen += size_diff - bytepos += size_diff it.set_char_at(ip, item) end diff --git a/lib/core/text/native.nit b/lib/core/text/native.nit index d370ee8..fad3ae1 100644 --- a/lib/core/text/native.nit +++ b/lib/core/text/native.nit @@ -112,14 +112,34 @@ extern class NativeString `{ char* `} # ~~~raw # assert "かきく".as(FlatString).items.char_at(1) == '�' # ~~~ - fun char_at(pos: Int): Char `{ - char c = self[pos]; - if((c & 0x80) == 0x00) return (uint32_t)c; - if(((c & 0xE0) == 0xC0) && ((self[pos + 1] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0x1F) << 6) + ((((uint32_t)self[pos + 1] & 0x3F))); - if(((c & 0xF0) == 0xE0) && ((self[pos + 1] & 0xC0) == 0x80) && ((self[pos + 2] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0xF) << 12) + ((((uint32_t)self[pos + 1]) & 0x3F) << 6) + ((((uint32_t)self[pos + 2] & 0x3F))); - if(((c & 0xF8) == 0xF0) && ((self[pos + 1] & 0xC0) == 0x80) && ((self[pos + 2] & 0xC0) == 0x80) && ((self[pos + 3] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0x7) << 18) + ((((uint32_t)self[pos + 1]) & 0x3F) << 12) + ((((uint32_t)self[pos + 2]) & 0x3F) << 6) + ((((uint32_t)self[pos + 3] & 0x3F))); - return 0xFFFD; - `} + fun char_at(pos: Int): Char do + var c = self[pos] + if c & 0x80u8 == 0u8 then return c.ascii + var b = fetch_4_hchars(pos) + var ret = 0 + if b & 0xC00000 != 0x800000 then return 0xFFFD.code_point + if b & 0xE0000000 == 0xC0000000 then + ret |= (b & 0x1F000000) >> 18 + ret |= (b & 0x3F0000) >> 16 + return ret.code_point + end + if not b & 0xC000 == 0x8000 then return 0xFFFD.code_point + if b & 0xF0000000 == 0xE0000000 then + ret |= (b & 0xF000000) >> 12 + ret |= (b & 0x3F0000) >> 10 + ret |= (b & 0x3F00) >> 8 + return ret.code_point + end + if not b & 0xC0 == 0x80 then return 0xFFFD.code_point + if b & 0xF8000000 == 0xF0000000 then + ret |= (b.to_i & 0x7000000) >> 6 + ret |= (b.to_i & 0x3F0000) >> 4 + ret |= (b.to_i & 0x3F00) >> 2 + ret |= b.to_i & 0x3F + return ret.code_point + end + return 0xFFFD.code_point + end # Gets the byte index of char at position `n` in UTF-8 String fun char_to_byte_index(n: Int): Int do return char_to_byte_index_cached(n, 0, 0) @@ -150,14 +170,34 @@ extern class NativeString `{ char* `} var ns_i = byte_from var my_i = char_from - while my_i < n do + var dist = n - my_i + + while dist > 0 do + while dist >= 4 do + var i = fetch_4_chars(ns_i) + if i & 0x80808080 != 0 then break + ns_i += 4 + my_i += 4 + dist -= 4 + end + if dist == 0 then break ns_i += length_of_char_at(ns_i) my_i += 1 + dist -= 1 end - while my_i > n do + while dist < 0 do + while dist <= -4 do + var i = fetch_4_chars(ns_i - 4) + if i & 0x80808080 != 0 then break + ns_i -= 4 + my_i -= 4 + dist += 4 + end + if dist == 0 then break ns_i = find_beginning_of_char_at(ns_i - 1) my_i -= 1 + dist += 1 end return ns_i -- 1.7.9.5