# Index of the character `index` in `_items`
fun char_to_byte_index(index: Int): Int do
- var ln = length
- assert index >= 0
- assert index < ln
+ var dpos = index - _position
+ var b = _bytepos
+ var its = _items
+ if dpos == 1 then
+ if its[b] & 0x80u8 == 0x00u8 then
+ b += 1
+ else
+ b += its.length_of_char_at(b)
+ end
+ _bytepos = b
+ _position = index
+ return b
+ end
+ if dpos == -1 then
+ b = its.find_beginning_of_char_at(b - 1)
+ _bytepos = b
+ _position = index
+ return b
+ end
+ if dpos == 0 then return b
+
+ var ln = _length
var pos = _position
# Find best insertion point
var delta_begin = index
var delta_end = (ln - 1) - index
var delta_cache = (pos - index).abs
var min = delta_begin
- var its = _items
if delta_cache < min then min = delta_cache
if delta_end < min then min = delta_end
var ns_i: Int
var my_i: Int
- if min == delta_begin then
- ns_i = first_byte
- my_i = 0
- else if min == delta_cache then
+ if min == delta_cache then
ns_i = _bytepos
my_i = pos
+ else if min == delta_begin then
+ ns_i = first_byte
+ my_i = 0
else
ns_i = its.find_beginning_of_char_at(last_byte)
- my_i = length - 1
+ my_i = _length - 1
end
ns_i = its.char_to_byte_index_cached(index, my_i, ns_i)
end
pos += 1
end
- return nns.to_s_with_length(nlen)
+ return nns.to_s_unsafe(nlen)
+ end
+
+ # Returns the character at position `index`.
+ #
+ # The cached pair (`_position`, `_bytepos`) is used to shortcut accesses to
+ # the next, previous or same character when the byte involved has its high
+ # bit clear (single-byte character).
+ # Every other access goes through `fetch_char_at`.
+ #
+ # NOTE: the bounds assertion is only evaluated on the slow path, i.e. after
+ # the shortcuts have had a chance to return.
+ redef fun [](index) do
+ var len = _length
+
+ # Statistically:
+ # * ~70% want the next char
+ # * ~23% want the previous
+ # * ~7% want the same char
+ #
+ # So it makes sense to shortcut early. And early is here.
+ var dpos = index - _position
+ var b = _bytepos
+ if dpos == 1 and index < len - 1 then
+ var its = _items
+ var c = its[b]
+ if c & 0x80u8 == 0x00u8 then
+ # We want the next, and current is easy.
+ # So next is easy to find!
+ b += 1
+ _position = index
+ _bytepos = b
+ # The rest will be done by the `dpos == 0` case below.
+ dpos = 0
+ end
+ else if dpos == -1 and index > 1 then
+ var its = _items
+ var c = its[b-1]
+ if c & 0x80u8 == 0x00u8 then
+ # We want the previous, and it is easy.
+ b -= 1
+ # `dpos` is reset for symmetry only: the function returns just below.
+ dpos = 0
+ _position = index
+ _bytepos = b
+ return c.ascii
+ end
+ end
+ if dpos == 0 then
+ # We know what we want (+0 or +1) just get it now!
+ var its = _items
+ var c = its[b]
+ if c & 0x80u8 == 0x00u8 then return c.ascii
+ return items.char_at(b)
+ end
+
+ # Slow path: validate `index`, then locate the byte position the long way.
+ assert index >= 0 and index < len
+ return fetch_char_at(index)
end
- redef fun [](index) do return _items.char_at(char_to_byte_index(index))
+	# Returns the `Char` found at character position `index` of `self`.
+	#
+	# WARNING: use at your own risk, no bounds checking is performed here.
+	fun fetch_char_at(index: Int): Char do
+		var byte_pos = char_to_byte_index(index)
+		var ns = _items
+		var lead = ns[byte_pos]
+		# High bit set: let `char_at` decode the character.
+		if (lead & 0x80u8) != 0x00u8 then return ns.char_at(byte_pos)
+		# High bit clear: the byte converts directly with `ascii`.
+		return lead.ascii
+	end
# If `self` contains only digits and alpha <= 'f', return the corresponding integer.
#
end
# Immutable strings of characters.
-class FlatString
+abstract class FlatString
super FlatText
super String
return new_items
end
- redef fun reversed
- do
+ redef fun reversed do
var b = new FlatBuffer.with_capacity(_bytelen + 1)
- for i in [0 .. _length[.step(-1) do
- b.add self[i]
+ var i = _length - 1
+ while i >= 0 do
+ b.add self.fetch_char_at(i)
+ i -= 1
end
var s = b.to_s.as(FlatString)
s._length = self._length
redef fun substring(from, count)
do
- assert count >= 0
+ if count <= 0 then return ""
if from < 0 then
count += from
- if count < 0 then count = 0
+ if count <= 0 then return ""
from = 0
end
- if (count + from) > length then count = length - from
+ var ln = _length
+ if (count + from) > ln then count = ln - from
if count <= 0 then return ""
var end_index = from + count - 1
+ return substring_impl(from, count, end_index)
+ end
+
+ private fun substring_impl(from, count, end_index: Int): String do
+ var cache = _position
+ var dfrom = (cache - from).abs
+ var dend = (end_index - from).abs
+
+ var bytefrom: Int
+ var byteto: Int
+ if dfrom < dend then
+ bytefrom = char_to_byte_index(from)
+ byteto = char_to_byte_index(end_index)
+ else
+ byteto = char_to_byte_index(end_index)
+ bytefrom = char_to_byte_index(from)
+ end
- var bytefrom = char_to_byte_index(from)
- var byteto = char_to_byte_index(end_index)
var its = _items
byteto += its.length_of_char_at(byteto) - 1
- var s = new FlatString.full(its, byteto - bytefrom + 1, bytefrom, byteto, count)
+ var s = new FlatString.full(its, byteto - bytefrom + 1, bytefrom, count)
return s
end
#
# `_items` will be used as is, without copy, to retrieve the characters of the string.
# Aliasing issues is the responsibility of the caller.
- private init with_infos(items: NativeString, bytelen, from: Int)
+ private new with_infos(items: NativeString, bytelen, from: Int)
do
- self._items = items
- self._bytelen = bytelen
- _first_byte = from
- _bytepos = from
- _length = _items.utf8_length(_first_byte, last_byte)
+ var len = items.utf8_length(from, bytelen)
+ if bytelen == len then return new ASCIIFlatString.full_data(items, bytelen, from, len)
+ return new UnicodeFlatString.full_data(items, bytelen, from, len)
end
# Low-level creation of a new string with all the data.
#
# `_items` will be used as is, without copy, to retrieve the characters of the string.
# Aliasing issues is the responsibility of the caller.
- private init full(items: NativeString, bytelen, from, length: Int)
+ private new full(items: NativeString, bytelen, from, length: Int)
do
- self._items = items
- self._length = length
- self._bytelen = bytelen
- _first_byte = from
- _bytepos = from
+ if bytelen == length then return new ASCIIFlatString.full_data(items, bytelen, from, length)
+ return new UnicodeFlatString.full_data(items, bytelen, from, length)
end
redef fun ==(other)
return new FlatString.full(ns, new_bytelen, 0, newlen)
end
-
redef fun hash
do
if hash_cache == null then
redef fun substrings do return new FlatSubstringsIter(self)
end
+# Regular Nit UTF-8 strings
+private class UnicodeFlatString
+	super FlatString
+
+	# Low-level initializer storing every field as given.
+	#
+	# `items` is kept as is; `from` is used both as the first byte
+	# and as the initial value of the cached byte position.
+	init full_data(items: NativeString, bytelen, from, length: Int) do
+		_items = items
+		_bytelen = bytelen
+		_length = length
+		_first_byte = from
+		_bytepos = from
+	end
+
+	# Returns the part of `self` that starts at character index `from`.
+	#
+	# The result is built on the same `items`, starting at the byte
+	# position of `from`.
+	redef fun substring_from(from) do
+		if from >= _length then return empty
+		if from <= 0 then return self
+		var byte_start = char_to_byte_index(from)
+		# Bytes left once the ones before `byte_start` are skipped
+		var remaining_bytes = bytelen - (byte_start - _first_byte)
+		var remaining_chars = _length - from
+		return new FlatString.full(items, remaining_bytes, byte_start, remaining_chars)
+	end
+end
+
+# Special cases of String where all the characters are ASCII-based
+#
+# Optimizes access operations to O(1) complexity.
+private class ASCIIFlatString
+	super FlatString
+
+	# Low-level initializer storing every field as given.
+	#
+	# `from` is used both as the first byte and as the initial
+	# value of the cached byte position.
+	init full_data(items: NativeString, bytelen, from, length: Int) do
+		self._items = items
+		self._length = length
+		self._bytelen = bytelen
+		_first_byte = from
+		_bytepos = from
+	end
+
+	# Character at `idx`, read directly at byte `idx + _first_byte`.
+	redef fun [](idx) do
+		assert idx < _bytelen and idx >= 0
+		return _items[idx + _first_byte].ascii
+	end
+
+	# Returns at most `count` characters starting at `from`.
+	#
+	# A negative `from` is clamped to 0 (shortening `count` accordingly),
+	# and `count` is clamped to what is available after `from`.
+	# An empty string is returned when nothing is left to take.
+	redef fun substring(from, count) do
+		if count <= 0 then return ""
+
+		if from < 0 then
+			count += from
+			if count <= 0 then return ""
+			from = 0
+		end
+		var ln = _length
+		if (count + from) > ln then count = ln - from
+		# When `from >= ln` the clamping above leaves `count <= 0`:
+		# never build a string with a null or negative length.
+		if count <= 0 then return ""
+		return new ASCIIFlatString.full_data(_items, count, from + _first_byte, count)
+	end
+
+	# Returns a new string made of the characters of `self`, last to first.
+	redef fun reversed do
+		var b = new FlatBuffer.with_capacity(_bytelen + 1)
+		var i = _length - 1
+		while i >= 0 do
+			b.add self[i]
+			i -= 1
+		end
+		var s = b.to_s.as(FlatString)
+		return s
+	end
+
+	# One byte per character: the byte index is the char index shifted by `_first_byte`.
+	redef fun char_to_byte_index(index) do return index + _first_byte
+
+	# Same one-byte-per-character shortcut: `count` is both byte and char length.
+	redef fun substring_impl(from, count, end_index) do
+		return new ASCIIFlatString.full_data(_items, count, from + _first_byte, count)
+	end
+
+	# Unchecked read of the character at `i` (no assertion on `i`).
+	redef fun fetch_char_at(i) do return _items[i + _first_byte].ascii
+end
+
private class FlatStringCharReverseIterator
super IndexedIterator[Char]
do
# Check that the index (+ _first_byte) is not larger than last_byte
# In other terms, if the index is valid
- assert index >= 0
- var target = self.target
+ var target = _target
+ assert index >= 0 and index < target._bytelen
var ind = index + target._first_byte
- assert ind <= target.last_byte
return target._items[ind]
end
lshift_bytes(ip + clen, -size_diff)
end
_bytelen += size_diff
- bytepos += size_diff
it.set_char_at(ip, item)
end
redef fun clear do
is_dirty = true
- if written then reset
_bytelen = 0
_length = 0
+ if written then reset
end
redef fun empty do return new Buffer
do
var c = capacity
if cap <= c then return
- while c <= cap do c = c * 2 + 2
+ if c <= 16 then c = 16
+ while c <= cap do c = c * 2
# The COW flag can be set at false here, since
# it does a copy of the current `Buffer`
written = false
var bln = _bytelen
- var a = new NativeString(c+1)
+ var a = new NativeString(c)
if bln > 0 then
var it = _items
if bln > 0 then it.copy_to(a, bln, 0, 0)
init with_capacity(cap: Int)
do
assert cap >= 0
- _items = new NativeString(cap + 1)
+ _items = new NativeString(cap)
capacity = cap
_bytelen = 0
end
return new FlatBuffer.with_infos(r_items, byte_length, byte_length, count)
end
+	# Appends `length` characters of `s`, starting at character index `from`.
+	#
+	# Texts that are not `FlatText` are delegated to `super`;
+	# flat ones have their bytes copied directly into `_items`.
+	redef fun append_substring_impl(s, from, length) do
+		if length <= 0 then return
+		if not s isa FlatText then
+			super
+			return
+		end
+		var bytest = s.char_to_byte_index(from)
+		# Byte index where the last requested character starts
+		var bytend = s.char_to_byte_index(from + length - 1)
+		# Count every byte of that last character, not only its first one,
+		# otherwise a multi-byte last character would be truncated.
+		var btln = bytend - bytest + s._items.length_of_char_at(bytend)
+		enlarge(btln + _bytelen)
+		s._items.copy_to(_items, btln, bytest, _bytelen)
+		_bytelen += btln
+		_length += length
+	end
+
redef fun reverse
do
written = false
return to_s_with_length(cstring_length)
end
- # Returns `self` as a String of `length`.
- redef fun to_s_with_length(length): FlatString
+ redef fun to_s_with_length(length)
do
assert length >= 0
return clean_utf8(length)
return new FlatString.full(self, bytelen, 0, unilen)
end
- # Returns `self` as a new String.
- redef fun to_s_with_copy: FlatString
+	# Builds a `FlatString` over `self` through `FlatString.with_infos`.
+	#
+	# When `len` is null, `cstring_length` is used as the byte length.
+	redef fun to_s_unsafe(len) do
+		var byte_count = len or else cstring_length
+		return new FlatString.with_infos(self, byte_count, 0)
+	end
+
+ # Delegates to `to_s_with_copy_and_length`, with `cstring_length` as the byte count.
+ redef fun to_s_with_copy do return to_s_with_copy_and_length(cstring_length)
+
+ # Get a `String` from `length` bytes at `self` copied into Nit memory
+ fun to_s_with_copy_and_length(length: Int): String
do
- var length = cstring_length
var r = clean_utf8(length)
if r.items != self then return r
var new_self = new NativeString(length + 1)
var end_length = len
var pos = 0
var chr_ln = 0
- while pos < len do
+ var rem = len
+ while rem > 0 do
+ while rem >= 4 do
+ var i = fetch_4_chars(pos)
+ if i & 0x80808080 != 0 then break
+ pos += 4
+ chr_ln += 4
+ rem -= 4
+ end
+ if rem == 0 then break
var b = self[pos]
+ if b & 0x80u8 == 0x00u8 then
+ pos += 1
+ chr_ln += 1
+ rem -= 1
+ continue
+ end
var nxst = length_of_char_at(pos)
var ok_st: Bool
if nxst == 1 then
replacements.add pos
end_length += 2
pos += 1
+ rem -= 1
chr_ln += 1
continue
end
end_length += 2
pos += 1
chr_ln += 1
+ rem -= 1
continue
end
- pos += c.u8char_len
+ var clen = c.u8char_len
+ pos += clen
+ rem -= clen
chr_ln += 1
end
var ret = self
#
# Very unsafe, make sure to have room for this char prior to calling this function.
private fun set_char_at(pos: Int, c: Char) do
+ # Fast path: a code point below 128 is stored as a single byte,
+ # skipping the `u8char_len` / `native_set_char` route.
+ if c.code_point < 128 then
+ self[pos] = c.code_point.to_b
+ return
+ end
var ln = c.u8char_len
native_set_char(pos, c, ln)
end
end
redef class Int
- redef fun to_base(base, signed)
- do
- var l = digit_count(base)
- var s = new FlatBuffer.from(" " * l)
- fill_buffer(s, base, signed)
- return s.to_s
- end
-
# return displayable int in base 10 and signed
#
# assert 1.to_s == "1"