fun char_to_byte_index(index: Int): Int do
var dpos = index - _position
var b = _bytepos
+ var its = _items
- if dpos == 0 then return b
if dpos == 1 then
- b += _items.length_of_char_at(b)
+ if its[b] & 0x80u8 == 0x00u8 then
+ b += 1
+ else
+ b += its.length_of_char_at(b)
+ end
_bytepos = b
_position = index
return b
end
if dpos == -1 then
- b = _items.find_beginning_of_char_at(b - 1)
+ b = its.find_beginning_of_char_at(b - 1)
_bytepos = b
_position = index
return b
end
+ if dpos == 0 then return b
var ln = _length
var pos = _position
var delta_end = (ln - 1) - index
var delta_cache = (pos - index).abs
var min = delta_begin
- var its = _items
if delta_cache < min then min = delta_cache
if delta_end < min then min = delta_end
end
+ # Character at char-index `index`.
+ #
+ # Serves the common cases (same position, or an ASCII neighbour of it)
+ # straight from `_position`/`_bytepos`; every other request goes through
+ # `fetch_char_at`. An out-of-bounds `index` always ends on the assert.
redef fun [](index) do
- assert index >= 0 and index < _length
+ var len = _length
+
+ # Statistically:
+ # * ~70% want the next char
+ # * ~23% want the previous
+ # * ~7% want the same char
+ #
+ # So it makes sense to shortcut early. And early is here.
+ var dpos = index - _position
+ var b = _bytepos
+ if dpos == 1 and index < len - 1 then
+ var its = _items
+ var c = its[b]
+ if c & 0x80u8 == 0x00u8 then
+ # We want the next, and current is easy.
+ # So next is easy to find!
+ b += 1
+ _position = index
+ _bytepos = b
+ # The rest will be done by `dpos==0` below.
+ dpos = 0
+ end
+ else if dpos == -1 and index > 1 then
+ var its = _items
+ var c = its[b-1]
+ if c & 0x80u8 == 0x00u8 then
+ # We want the previous, and it is easy.
+ b -= 1
+ dpos = 0
+ _position = index
+ _bytepos = b
+ return c.ascii
+ end
+ end
+ # Only take the shortcut for an in-range index: `_position` may sit outside
+ # `[0, len)` (empty string, or a position stored by `char_to_byte_index`),
+ # and such a request must reach the assert instead of reading `_items`.
+ if dpos == 0 and index >= 0 and index < len then
+ # We know what we want (+0 or +1) just get it now!
+ var its = _items
+ var c = its[b]
+ if c & 0x80u8 == 0x00u8 then return c.ascii
+ return its.char_at(b)
+ end
+
+ assert index >= 0 and index < len
return fetch_char_at(index)
end
end
# Immutable strings of characters.
-class FlatString
+abstract class FlatString
super FlatText
super String
+ # Delegates to `_items.fast_cstring`, starting at `_first_byte`.
redef fun fast_cstring do return _items.fast_cstring(_first_byte)
- redef fun substring_from(from) do
- if from >= self._length then return empty
- if from <= 0 then return self
- var c = char_to_byte_index(from)
- var st = c - _first_byte
- var fln = bytelen - st
- return new FlatString.full(items, fln, c, _length - from)
- end
-
redef fun substring(from, count)
do
- assert count >= 0
+ if count <= 0 then return ""
if from < 0 then
count += from
- if count < 0 then return ""
+ if count <= 0 then return ""
from = 0
end
#
# `_items` will be used as is, without copy, to retrieve the characters of the string.
# Aliasing issues is the responsibility of the caller.
- private init with_infos(items: NativeString, bytelen, from: Int)
+ private new with_infos(items: NativeString, bytelen, from: Int)
do
- self._items = items
- self._bytelen = bytelen
- _first_byte = from
- _bytepos = from
- _length = _items.utf8_length(_first_byte, bytelen)
+ # Measure the `bytelen` bytes starting at `from`; the result is handed on
+ # to `full_data` as the character length.
+ var len = items.utf8_length(from, bytelen)
+ # As many bytes as characters: pick the ASCII-only representation.
+ if bytelen == len then return new ASCIIFlatString.full_data(items, bytelen, from, len)
+ # Otherwise use the general UTF-8 representation.
+ return new UnicodeFlatString.full_data(items, bytelen, from, len)
end
# Low-level creation of a new string with all the data.
#
# `_items` will be used as is, without copy, to retrieve the characters of the string.
# Aliasing issues is the responsibility of the caller.
- private init full(items: NativeString, bytelen, from, length: Int)
+ private new full(items: NativeString, bytelen, from, length: Int)
do
- self._items = items
- self._length = length
- self._bytelen = bytelen
- _first_byte = from
- _bytepos = from
+ # `length` is used as given by the caller; it is not recomputed here.
+ # As many bytes as characters: pick the ASCII-only representation.
+ if bytelen == length then return new ASCIIFlatString.full_data(items, bytelen, from, length)
+ # Otherwise use the general UTF-8 representation.
+ return new UnicodeFlatString.full_data(items, bytelen, from, length)
end
redef fun ==(other)
return new FlatString.full(ns, new_bytelen, 0, newlen)
end
-
redef fun hash
do
if hash_cache == null then
redef fun substrings do return new FlatSubstringsIter(self)
end
+# Regular Nit UTF-8 strings
+private class UnicodeFlatString
+ super FlatString
+
+ # Low-level initializer: stores the given values as is; `items` is kept by
+ # reference, no copy is made here.
+ init full_data(items: NativeString, bytelen, from, length: Int) do
+ self._items = items
+ self._length = length
+ self._bytelen = bytelen
+ _first_byte = from
+ # The byte cursor starts on the first byte of the string.
+ _bytepos = from
+ end
+
+ # Suffix of `self` starting at char index `from`.
+ #
+ # Returns `empty` when `from` is past the end and `self` when `from <= 0`;
+ # otherwise builds a new string over the same `items`, without copying.
+ redef fun substring_from(from) do
+ if from >= self._length then return empty
+ if from <= 0 then return self
+ # Byte index in `items` of the character at `from`.
+ var c = char_to_byte_index(from)
+ # Number of bytes skipped at the front of this string.
+ var st = c - _first_byte
+ # Number of bytes left for the suffix.
+ var fln = bytelen - st
+ return new FlatString.full(items, fln, c, _length - from)
+ end
+end
+
+# Special cases of String where all the characters are ASCII-based
+#
+# Optimizes access operations to O(1) complexity.
+private class ASCIIFlatString
+ super FlatString
+
+ # Low-level initializer: stores the given values as is; `items` is kept by
+ # reference, no copy is made here.
+ init full_data(items: NativeString, bytelen, from, length: Int) do
+ self._items = items
+ self._length = length
+ self._bytelen = bytelen
+ _first_byte = from
+ # The byte cursor starts on the first byte of the string.
+ _bytepos = from
+ end
+
+ # Character at index `idx`, read directly from byte `idx + _first_byte`.
+ #
+ # Asserts that `idx` is within `[0, _bytelen)`.
+ redef fun [](idx) do
+ assert idx < _bytelen and idx >= 0
+ return _items[idx + _first_byte].ascii
+ end
+
+ # Substring of `count` characters starting at `from`, sharing `_items`.
+ #
+ # Returns `""` whenever the requested range turns out to be empty.
+ redef fun substring(from, count) do
+ var ln = _length
+ if count <= 0 then return ""
+ # Clamp the range so it does not run past the end of the string.
+ if (count + from) > ln then count = ln - from
+ if count <= 0 then return ""
+ # A negative `from` only shortens the range: drop the part before index 0.
+ if from < 0 then
+ count += from
+ if count <= 0 then return ""
+ from = 0
+ end
+ # `count` is passed both as byte length and as character length.
+ return new ASCIIFlatString.full_data(_items, count, from + _first_byte, count)
+ end
+
+ # Reversed copy of `self`, built by appending the characters from last to
+ # first into a `FlatBuffer`.
+ redef fun reversed do
+ var b = new FlatBuffer.with_capacity(_bytelen + 1)
+ var i = _length - 1
+ while i >= 0 do
+ b.add self[i]
+ i -= 1
+ end
+ var s = b.to_s.as(FlatString)
+ return s
+ end
+
+ # Byte index of the character at `index`: a plain offset from `_first_byte`.
+ redef fun char_to_byte_index(index) do return index + _first_byte
+
+ # Same construction as the last step of `substring`, without any bounds
+ # handling. `end_index` is not used here.
+ redef fun substring_impl(from, count, end_index) do
+ return new ASCIIFlatString.full_data(_items, count, from + _first_byte, count)
+ end
+
+ # Character at index `i`, read directly from byte `i + _first_byte`; no
+ # bounds check is done here.
+ redef fun fetch_char_at(i) do return _items[i + _first_byte].ascii
+end
+
private class FlatStringCharReverseIterator
super IndexedIterator[Char]
redef fun clear do
is_dirty = true
- if written then reset
_bytelen = 0
_length = 0
+ # Lengths are zeroed first; when `written` is set, `_capacity` goes back
+ # to 16 before `reset` is called.
+ # NOTE(review): `reset` is defined outside this view; presumably it
+ # replaces `_items` using `_capacity` - confirm.
+ if written then
+ _capacity = 16
+ reset
+ end
end
+ # Returns a freshly created `Buffer`.
redef fun empty do return new Buffer
do
var c = capacity
if cap <= c then return
- while c <= cap do c = c * 2 + 2
+ if c <= 16 then c = 16
+ while c <= cap do c = c * 2
# The COW flag can be set at false here, since
# it does a copy of the current `Buffer`
written = false
var bln = _bytelen
- var a = new NativeString(c+1)
+ var a = new NativeString(c)
if bln > 0 then
var it = _items
if bln > 0 then it.copy_to(a, bln, 0, 0)
init from(s: Text)
do
_items = new NativeString(s.bytelen)
- if s isa FlatText then
- _items = s._items
- else
- for i in substrings do i.as(FlatString)._items.copy_to(_items, i._bytelen, 0, 0)
- end
+ # Copy every flat chunk of `s` one after the other: each chunk is read
+ # from its own first byte and written right behind the bytes already
+ # copied, so chunks neither overwrite each other nor start at the wrong
+ # source offset.
+ var offset = 0
+ for i in s.substrings do
+ var chunk_len = i._bytelen
+ i._items.copy_to(_items, chunk_len, i.first_byte, offset)
+ offset += chunk_len
+ end
_bytelen = s.bytelen
_length = s.length
_capacity = _bytelen
- written = true
end
# Create a new empty string with a given capacity.
init with_capacity(cap: Int)
do
assert cap >= 0
- _items = new NativeString(cap + 1)
+ # Allocate exactly `cap` bytes; the extra byte of the previous version is
+ # no longer reserved.
+ _items = new NativeString(cap)
capacity = cap
+ # No byte is in use yet.
_bytelen = 0
end
return new FlatBuffer.with_infos(r_items, byte_length, byte_length, count)
end
+ # Appends the `length` characters of `s` that start at char index `from`.
+ #
+ # Does nothing when `length <= 0`. A `FlatText` source is copied bytewise
+ # from its `_items`; any other kind of text is left to the inherited
+ # implementation.
+ redef fun append_substring_impl(s, from, length) do
+ if length <= 0 then return
+ if not s isa FlatText then
+ super
+ return
+ end
+ var sits = s._items
+ # First byte of the first and of the last character to copy.
+ var bytest = s.char_to_byte_index(from)
+ var bytend = s.char_to_byte_index(from + length - 1)
+ # `bytend` only points at the first byte of the last character: add that
+ # character's full byte length so a multi-byte character is not cut.
+ var btln = bytend - bytest + sits.length_of_char_at(bytend)
+ enlarge(btln + _bytelen)
+ sits.copy_to(_items, btln, bytest, _bytelen)
+ _bytelen += btln
+ _length += length
+ end
+
redef fun reverse
do
written = false
return to_s_with_length(cstring_length)
end
- # Returns `self` as a String of `length`.
- redef fun to_s_with_length(length): FlatString
+ redef fun to_s_with_length(length)
do
assert length >= 0
return clean_utf8(length)
return new FlatString.with_infos(self, len, 0)
end
- # Returns `self` as a new String.
- redef fun to_s_with_copy: FlatString
+ # Same as `to_s_with_copy_and_length`, with `cstring_length` as the length.
+ redef fun to_s_with_copy do return to_s_with_copy_and_length(cstring_length)
+
+ # Get a `String` from `length` bytes at `self` copied into Nit memory
+ fun to_s_with_copy_and_length(length: Int): String
do
- var length = cstring_length
var r = clean_utf8(length)
if r.items != self then return r
var new_self = new NativeString(length + 1)
#
# Very unsafe, make sure to have room for this char prior to calling this function.
private fun set_char_at(pos: Int, c: Char) do
+ var cp = c.code_point
+ # Code points below 128 are stored as a single byte.
+ if cp < 128 then
+ self[pos] = cp.to_b
+ return
+ end
var ln = c.u8char_len
- native_set_char(pos, c, ln)
- end
-
- private fun native_set_char(pos: Int, c: Char, ln: Int) `{
- char* dst = self + pos;
- switch(ln){
- case 1:
- dst[0] = c;
- break;
- case 2:
- dst[0] = 0xC0 | ((c & 0x7C0) >> 6);
- dst[1] = 0x80 | (c & 0x3F);
- break;
- case 3:
- dst[0] = 0xE0 | ((c & 0xF000) >> 12);
- dst[1] = 0x80 | ((c & 0xFC0) >> 6);
- dst[2] = 0x80 | (c & 0x3F);
- break;
- case 4:
- dst[0] = 0xF0 | ((c & 0x1C0000) >> 18);
- dst[1] = 0x80 | ((c & 0x3F000) >> 12);
- dst[2] = 0x80 | ((c & 0xFC0) >> 6);
- dst[3] = 0x80 | (c & 0x3F);
- break;
- }
- `}
+ # Otherwise the bits of the code point are spread over `ln` bytes: a lead
+ # byte tagged 0xC0, 0xE0 or 0xF0 holding the high bits, followed by bytes
+ # tagged 0x80 that each carry the next 6 bits.
+ # NOTE(review): nothing is written when `ln` is not 2, 3 or 4.
+ if ln == 2 then
+ self[pos] = (0xC0 | ((cp & 0x7C0) >> 6)).to_b
+ self[pos + 1] = (0x80 | (cp & 0x3F)).to_b
+ else if ln == 3 then
+ self[pos] = (0xE0 | ((cp & 0xF000) >> 12)).to_b
+ self[pos + 1] = (0x80 | ((cp & 0xFC0) >> 6)).to_b
+ self[pos + 2] = (0x80 | (cp & 0x3F)).to_b
+ else if ln == 4 then
+ self[pos] = (0xF0 | ((cp & 0x1C0000) >> 18)).to_b
+ self[pos + 1] = (0x80 | ((cp & 0x3F000) >> 12)).to_b
+ self[pos + 2] = (0x80 | ((cp & 0xFC0) >> 6)).to_b
+ self[pos + 3] = (0x80 | (cp & 0x3F)).to_b
+ end
+ end
end
redef class Int
- redef fun to_base(base, signed)
- do
- var l = digit_count(base)
- var s = new FlatBuffer.from(" " * l)
- fill_buffer(s, base, signed)
- return s.to_s
- end
-
# return displayable int in base 10 and signed
#
# assert 1.to_s == "1"