lib/core: Added new `append_substring` service to avoid creating ephemeral instances

[nit.git] / lib / core / text / flat.nit
diff --git a/lib/core/text/flat.nit b/lib/core/text/flat.nit

index 22707d0..83cae37 100644 (file)
--- a/lib/core/text/flat.nit
+++ b/lib/core/text/flat.nit
@@ -50,17 +50,35 @@ redef class FlatText
  
         # Index of the character `index` in `_items`
         fun char_to_byte_index(index: Int): Int do
-               var ln = length
-               assert index >= 0
-               assert index < ln
+               var dpos = index - _position
+               var b = _bytepos
+               var its = _items
  
+               if dpos == 1 then
+                       if its[b] & 0x80u8 == 0x00u8 then
+                               b += 1
+                       else
+                               b += its.length_of_char_at(b)
+                       end
+                       _bytepos = b
+                       _position = index
+                       return b
+               end
+               if dpos == -1 then
+                       b = its.find_beginning_of_char_at(b - 1)
+                       _bytepos = b
+                       _position = index
+                       return b
+               end
+               if dpos == 0 then return b
+
+               var ln = _length
                 var pos = _position
                 # Find best insertion point
                 var delta_begin = index
                 var delta_end = (ln - 1) - index
                 var delta_cache = (pos - index).abs
                 var min = delta_begin
-               var its = _items
  
                 if delta_cache < min then min = delta_cache
                 if delta_end < min then min = delta_end
@@ -68,15 +86,15 @@ redef class FlatText
                 var ns_i: Int
                 var my_i: Int
  
-               if min == delta_begin then
-                       ns_i = first_byte
-                       my_i = 0
-               else if min == delta_cache then
+               if min == delta_cache then
                         ns_i = _bytepos
                         my_i = pos
+               else if min == delta_begin then
+                       ns_i = first_byte
+                       my_i = 0
                 else
                         ns_i = its.find_beginning_of_char_at(last_byte)
-                       my_i = length - 1
+                       my_i = _length - 1
                 end
  
                 ns_i = its.char_to_byte_index_cached(index, my_i, ns_i)
@@ -274,10 +292,66 @@ redef class FlatText
                         end
                         pos += 1
                 end
-               return nns.to_s_with_length(nlen)
+               return nns.to_s_unsafe(nlen)
+       end
+
+       redef fun [](index) do
+               var len = _length
+
+               # Returns the char at `index`, trying first to reuse the cached
+               # (`_position`, `_bytepos`) pair. Statistically ~70% of accesses
+               # want the next char, ~23% the previous and ~7% the same char,
+               # so it makes sense to shortcut early. And early is here.
+               #
+               # Accesses that cannot be shortcut go through `fetch_char_at`.
+               var dpos = index - _position
+               var b = _bytepos
+               if dpos == 1 and index < len - 1 then
+                       var its = _items
+                       var c = its[b]
+                       if c & 0x80u8 == 0x00u8 then
+                               # We want the next, and current is easy.
+                               # So next is easy to find!
+                               b += 1
+                               _position = index
+                               _bytepos = b
+                               # The rest will be done by `dpos==0` below.
+                               dpos = 0
+                       end
+               else if dpos == -1 and index > 1 then
+                       var its = _items
+                       var c = its[b-1]
+                       if c & 0x80u8 == 0x00u8 then
+                               # We want the previous, and it is easy.
+                               b -= 1
+                               _position = index
+                               _bytepos = b
+                               return c.ascii
+                       end
+               end
+               # An out-of-range `index` must reach the assert, not read `_items`.
+               if dpos == 0 and index < len then
+                       # We know what we want (+0 or +1) just get it now!
+                       var its = _items
+                       var c = its[b]
+                       if c & 0x80u8 == 0x00u8 then return c.ascii
+                       return its.char_at(b)
+               end
+
+               assert index >= 0 and index < len
+               return fetch_char_at(index)
         end
  
-       redef fun [](index) do return _items.char_at(char_to_byte_index(index))
+       # Char located at position `index` of `self`, read from `_items`
+       #
+       # WARNING: Use at your own risk, `index` is not bound-checked here
+       fun fetch_char_at(index: Int): Char do
+               var byte_pos = char_to_byte_index(index)
+               var ns = _items
+               var first = ns[byte_pos]
+               if first & 0x80u8 != 0x00u8 then return ns.char_at(byte_pos)
+               return first.ascii
+       end
  
         # If `self` contains only digits and alpha <= 'f', return the corresponding integer.
         #
@@ -298,7 +372,7 @@ redef class FlatText
  end
  
  # Immutable strings of characters.
-class FlatString
+abstract class FlatString
         super FlatText
         super String
  
@@ -317,11 +391,12 @@ class FlatString
                 return new_items
         end
  
-       redef fun reversed
-       do
+       redef fun reversed do
                 var b = new FlatBuffer.with_capacity(_bytelen + 1)
-               for i in [0 .. _length[.step(-1) do
-                       b.add self[i]
+               var i = _length - 1
+               while i >= 0 do
+                       b.add self.fetch_char_at(i)
+                       i -= 1
                 end
                 var s = b.to_s.as(FlatString)
                 s._length = self._length
@@ -332,24 +407,40 @@ class FlatString
  
         redef fun substring(from, count)
         do
-               assert count >= 0
+               if count <= 0 then return ""
  
                 if from < 0 then
                         count += from
-                       if count < 0 then count = 0
+                       if count <= 0 then return ""
                         from = 0
                 end
  
-               if (count + from) > length then count = length - from
+               var ln = _length
+               if (count + from) > ln then count = ln - from
                 if count <= 0 then return ""
                 var end_index = from + count - 1
+               return substring_impl(from, count, end_index)
+       end
+
+       private fun substring_impl(from, count, end_index: Int): String do
+               var cache = _position
+               var dfrom = (cache - from).abs
+               var dend = (end_index - from).abs
+
+               var bytefrom: Int
+               var byteto: Int
+               if dfrom < dend then
+                       bytefrom = char_to_byte_index(from)
+                       byteto = char_to_byte_index(end_index)
+               else
+                       byteto = char_to_byte_index(end_index)
+                       bytefrom = char_to_byte_index(from)
+               end
  
-               var bytefrom = char_to_byte_index(from)
-               var byteto = char_to_byte_index(end_index)
                 var its = _items
                 byteto += its.length_of_char_at(byteto) - 1
  
-               var s = new FlatString.full(its, byteto - bytefrom + 1, bytefrom, byteto, count)
+               var s = new FlatString.full(its, byteto - bytefrom + 1, bytefrom, count)
                 return s
         end
  
@@ -398,26 +489,21 @@ class FlatString
         #
         # `_items` will be used as is, without copy, to retrieve the characters of the string.
         # Aliasing issues is the responsibility of the caller.
-       private init with_infos(items: NativeString, bytelen, from: Int)
+       private new with_infos(items: NativeString, bytelen, from: Int)
         do
-               self._items = items
-               self._bytelen = bytelen
-               _first_byte = from
-               _bytepos = from
-               _length = _items.utf8_length(_first_byte, last_byte)
+               var len = items.utf8_length(from, bytelen)
+               if bytelen == len then return new ASCIIFlatString.full_data(items, bytelen, from, len)
+               return new UnicodeFlatString.full_data(items, bytelen, from, len)
         end
  
         # Low-level creation of a new string with all the data.
         #
         # `_items` will be used as is, without copy, to retrieve the characters of the string.
         # Aliasing issues is the responsibility of the caller.
-       private init full(items: NativeString, bytelen, from, length: Int)
+       private new full(items: NativeString, bytelen, from, length: Int)
         do
-               self._items = items
-               self._length = length
-               self._bytelen = bytelen
-               _first_byte = from
-               _bytepos = from
+               if bytelen == length then return new ASCIIFlatString.full_data(items, bytelen, from, length)
+               return new UnicodeFlatString.full_data(items, bytelen, from, length)
         end
  
         redef fun ==(other)
@@ -514,7 +600,6 @@ class FlatString
                 return new FlatString.full(ns, new_bytelen, 0, newlen)
         end
  
-
         redef fun hash
         do
                 if hash_cache == null then
@@ -539,6 +624,80 @@ class FlatString
         redef fun substrings do return new FlatSubstringsIter(self)
  end
  
+# General-purpose flat string backed by UTF-8 encoded bytes
+private class UnicodeFlatString
+       super FlatString
+
+       init full_data(items: NativeString, bytelen, from, length: Int) do
+               _items = items
+               _bytelen = bytelen
+               _length = length
+               _first_byte = from
+               _bytepos = from # the byte cursor starts on the first byte
+       end
+
+       redef fun substring_from(from) do
+               if from >= _length then return empty
+               if from <= 0 then return self
+               var start_byte = char_to_byte_index(from)
+               var skipped = start_byte - _first_byte
+               var remaining = bytelen - skipped
+               return new FlatString.full(items, remaining, start_byte, _length - from)
+       end
+end
+
+# Special cases of String where all the characters are ASCII-based
+#
+# Every char is one byte, so char `i` sits at byte `i + _first_byte` and access is O(1).
+private class ASCIIFlatString
+       super FlatString
+
+       init full_data(items: NativeString, bytelen, from, length: Int) do
+               self._items = items
+               self._length = length
+               self._bytelen = bytelen
+               _first_byte = from
+               _bytepos = from
+       end
+
+       redef fun [](idx) do
+               assert idx < _bytelen and idx >= 0
+               return _items[idx + _first_byte].ascii
+       end
+
+       redef fun substring(from, count) do
+               if count <= 0 then return ""
+               if from < 0 then
+                       count += from # a negative `from` eats into `count`
+                       if count <= 0 then return ""
+                       from = 0
+               end
+               var ln = _length
+               if (count + from) > ln then count = ln - from
+               if count <= 0 then return "" # `from` is at or past the end: nothing to take
+               return new ASCIIFlatString.full_data(_items, count, from + _first_byte, count)
+       end
+
+       redef fun reversed do
+               var b = new FlatBuffer.with_capacity(_bytelen + 1)
+               var i = _length - 1
+               while i >= 0 do
+                       b.add self[i]
+                       i -= 1
+               end
+               var s = b.to_s.as(FlatString)
+               return s
+       end
+
+       redef fun char_to_byte_index(index) do return index + _first_byte
+
+       redef fun substring_impl(from, count, end_index) do
+               return new ASCIIFlatString.full_data(_items, count, from + _first_byte, count)
+       end
+
+       redef fun fetch_char_at(i) do return _items[i + _first_byte].ascii
+end
+
  private class FlatStringCharReverseIterator
         super IndexedIterator[Char]
  
@@ -651,10 +810,9 @@ private class FlatStringByteView
         do
                 # Check that the index (+ _first_byte) is not larger than last_byte
                 # In other terms, if the index is valid
-               assert index >= 0
-               var target = self.target
+               var target = _target
+               assert index >= 0 and index < target._bytelen
                 var ind = index + target._first_byte
-               assert ind <= target.last_byte
                 return target._items[ind]
         end
  
@@ -747,7 +905,6 @@ class FlatBuffer
                         lshift_bytes(ip + clen, -size_diff)
                 end
                 _bytelen += size_diff
-               bytepos += size_diff
                 it.set_char_at(ip, item)
         end
  
@@ -765,9 +922,9 @@ class FlatBuffer
  
         redef fun clear do
                 is_dirty = true
-               if written then reset
                 _bytelen = 0
                 _length = 0
+               if written then reset
         end
  
         redef fun empty do return new Buffer
@@ -776,12 +933,13 @@ class FlatBuffer
         do
                 var c = capacity
                 if cap <= c then return
-               while c <= cap do c = c * 2 + 2
+               if c <= 16 then c = 16
+               while c <= cap do c = c * 2
                 # The COW flag can be set at false here, since
                 # it does a copy of the current `Buffer`
                 written = false
                 var bln = _bytelen
-               var a = new NativeString(c+1)
+               var a = new NativeString(c)
                 if bln > 0 then
                         var it = _items
                         if bln > 0 then it.copy_to(a, bln, 0, 0)
@@ -848,7 +1006,7 @@ class FlatBuffer
         init with_capacity(cap: Int)
         do
                 assert cap >= 0
-               _items = new NativeString(cap + 1)
+               _items = new NativeString(cap)
                 capacity = cap
                 _bytelen = 0
         end
@@ -896,6 +1054,21 @@ class FlatBuffer
                 return new FlatBuffer.with_infos(r_items, byte_length, byte_length, count)
         end
  
+       redef fun append_substring_impl(s, from, length) do # appends `length` chars of `s` from char `from`
+               if length <= 0 then return
+               if not s isa FlatText then
+                       super # not a flat source: defer to the inherited implementation
+                       return
+               end
+               var bytest = s.char_to_byte_index(from)
+               var bytend = s.char_to_byte_index(from + length - 1) # first byte of the last char
+               var btln = bytend - bytest + s._items.length_of_char_at(bytend) # include its whole width
+               enlarge(btln + _bytelen)
+               s._items.copy_to(_items, btln, bytest, _bytelen)
+               _bytelen += btln
+               _length += length
+       end
+
         redef fun reverse
         do
                 written = false
@@ -1070,8 +1243,7 @@ redef class NativeString
                 return to_s_with_length(cstring_length)
         end
  
-       # Returns `self` as a String of `length`.
-       redef fun to_s_with_length(length): FlatString
+       redef fun to_s_with_length(length)
         do
                 assert length >= 0
                 return clean_utf8(length)
@@ -1081,10 +1253,16 @@ redef class NativeString
                 return new FlatString.full(self, bytelen, 0, unilen)
         end
  
-       # Returns `self` as a new String.
-       redef fun to_s_with_copy: FlatString
+       redef fun to_s_unsafe(len) do
+               var byte_count = len or else cstring_length # no explicit size: default to `cstring_length`
+               return new FlatString.with_infos(self, byte_count, 0)
+       end
+
+       redef fun to_s_with_copy do return to_s_with_copy_and_length(cstring_length) # delegates, with `cstring_length` as the byte count
+
+       # Get a `String` from `length` bytes at `self` copied into Nit memory
+       fun to_s_with_copy_and_length(length: Int): String
         do
-               var length = cstring_length
                 var r = clean_utf8(length)
                 if r.items != self then return r
                 var new_self = new NativeString(length + 1)
@@ -1101,8 +1279,23 @@ redef class NativeString
                 var end_length = len
                 var pos = 0
                 var chr_ln = 0
-               while pos < len do
+               var rem = len
+               while rem > 0 do
+                       while rem >= 4 do
+                               var i = fetch_4_chars(pos)
+                               if i & 0x80808080 != 0 then break
+                               pos += 4
+                               chr_ln += 4
+                               rem -= 4
+                       end
+                       if rem == 0 then break
                         var b = self[pos]
+                       if b & 0x80u8 == 0x00u8 then
+                               pos += 1
+                               chr_ln += 1
+                               rem -= 1
+                               continue
+                       end
                         var nxst = length_of_char_at(pos)
                         var ok_st: Bool
                         if nxst == 1 then
@@ -1119,6 +1312,7 @@ redef class NativeString
                                 replacements.add pos
                                 end_length += 2
                                 pos += 1
+                               rem -= 1
                                 chr_ln += 1
                                 continue
                         end
@@ -1141,9 +1335,12 @@ redef class NativeString
                                 end_length += 2
                                 pos += 1
                                 chr_ln += 1
+                               rem -= 1
                                 continue
                         end
-                       pos += c.u8char_len
+                       var clen = c.u8char_len
+                       pos += clen
+                       rem -= clen
                         chr_ln += 1
                 end
                 var ret = self
@@ -1174,6 +1371,10 @@ redef class NativeString
         #
         # Very unsafe, make sure to have room for this char prior to calling this function.
         private fun set_char_at(pos: Int, c: Char) do
+               if c.code_point < 128 then # code points below 128 are stored as a single byte, bypassing `native_set_char`
+                       self[pos] = c.code_point.to_b
+                       return
+               end
                 var ln = c.u8char_len
                 native_set_char(pos, c, ln)
         end
@@ -1204,14 +1405,6 @@ redef class NativeString
  end
  
  redef class Int
-       redef fun to_base(base, signed)
-       do
-               var l = digit_count(base)
-               var s = new FlatBuffer.from(" " * l)
-               fill_buffer(s, base, signed)
-               return s.to_s
-       end
-
         # return displayable int in base 10 and signed
         #
         #     assert 1.to_s            == "1"