Merge: Added contributing guidelines and link from readme
[nit.git] / lib / core / text / flat.nit
index 200fdcf..6273609 100644 (file)
@@ -55,7 +55,11 @@ redef class FlatText
                var its = _items
 
                if dpos == 1 then
-                       b += _items.length_of_char_at(b)
+                       if its[b] & 0x80u8 == 0x00u8 then
+                               b += 1
+                       else
+                               b += its.length_of_char_at(b)
+                       end
                        _bytepos = b
                        _position = index
                        return b
@@ -221,6 +225,22 @@ redef class FlatText
                                req_esc += 1
                        else if c == 0x5Cu8 then
                                req_esc += 1
+                       else if c == 0x3Fu8 then
+                               var j = pos + 1
+                               if j < length then
+                                       var next = its[j]
+                                       # We ignore `??'` because it will be escaped as `??\'`.
+                                       if
+                                               next == 0x21u8 or
+                                               next == 0x28u8 or
+                                               next == 0x29u8 or
+                                               next == 0x2Du8 or
+                                               next == 0x2Fu8 or
+                                               next == 0x3Cu8 or
+                                               next == 0x3Du8 or
+                                               next == 0x3Eu8
+                                       then req_esc += 1
+                               end
                        else if c < 32u8 then
                                req_esc += 3
                        end
@@ -276,6 +296,27 @@ redef class FlatText
                                nns[opos] = 0x5Cu8
                                nns[opos + 1] = 0x5Cu8
                                opos += 2
+                       else if c == 0x3Fu8 then
+                               var j = pos + 1
+                               if j < length then
+                                       var next = its[j]
+                                       # We ignore `??'` because it will be escaped as `??\'`.
+                                       if
+                                               next == 0x21u8 or
+                                               next == 0x28u8 or
+                                               next == 0x29u8 or
+                                               next == 0x2Du8 or
+                                               next == 0x2Fu8 or
+                                               next == 0x3Cu8 or
+                                               next == 0x3Du8 or
+                                               next == 0x3Eu8
+                                       then
+                                               nns[opos] = 0x5Cu8
+                                               opos += 1
+                                       end
+                               end
+                               nns[opos] = 0x3Fu8
+                               opos += 1
                        else if c < 32u8 then
                                nns[opos] = 0x5Cu8
                                nns[opos + 1] = 0x30u8
@@ -292,7 +333,49 @@ redef class FlatText
        end
 
        redef fun [](index) do
-               assert index >= 0 and index < _length
+               var len = _length
+
+               # Statistically:
+               # * ~70% want the next char
+               # * ~23% want the previous
+               # * ~7% want the same char
+               #
+               # So it makes sense to shortcut early. And early is here.
+               var dpos = index - _position
+               var b = _bytepos
+               if dpos == 1 and index < len - 1 then
+                       var its = _items
+                       var c = its[b]
+                       if c & 0x80u8 == 0x00u8 then
+                               # We want the next, and current is easy.
+                               # So next is easy to find!
+                               b += 1
+                               _position = index
+                               _bytepos = b
+                               # The rest will be done by `dpos==0` bellow.
+                               dpos = 0
+                       end
+               else if dpos == -1 and index > 1 then
+                       var its = _items
+                       var c = its[b-1]
+                       if c & 0x80u8 == 0x00u8 then
+                               # We want the previous, and it is easy.
+                               b -= 1
+                               dpos = 0
+                               _position = index
+                               _bytepos = b
+                               return c.ascii
+                       end
+               end
+               if dpos == 0 then
+                       # We know what we want (+0 or +1) just get it now!
+                       var its = _items
+                       var c = its[b]
+                       if c & 0x80u8 == 0x00u8 then return c.ascii
+                       return items.char_at(b)
+               end
+
+               assert index >= 0 and index < len
                return fetch_char_at(index)
        end
 
@@ -323,10 +406,14 @@ redef class FlatText
                end
                return res
        end
+
+       redef fun copy_to_native(dst, n, src_off, dst_off) do
+               _items.copy_to(dst, n, first_byte + src_off, dst_off)
+       end
 end
 
 # Immutable strings of characters.
-class FlatString
+abstract class FlatString
        super FlatText
        super String
 
@@ -359,22 +446,13 @@ class FlatString
 
        redef fun fast_cstring do return _items.fast_cstring(_first_byte)
 
-       redef fun substring_from(from) do
-               if from >= self._length then return empty
-               if from <= 0 then return self
-               var c = char_to_byte_index(from)
-               var st = c - _first_byte
-               var fln = bytelen - st
-               return new FlatString.full(items, fln, c, _length - from)
-       end
-
        redef fun substring(from, count)
        do
                if count <= 0 then return ""
 
                if from < 0 then
                        count += from
-                       if count < 0 then return ""
+                       if count <= 0 then return ""
                        from = 0
                end
 
@@ -452,26 +530,21 @@ class FlatString
        #
        # `_items` will be used as is, without copy, to retrieve the characters of the string.
        # Aliasing issues is the responsibility of the caller.
-       private init with_infos(items: NativeString, bytelen, from: Int)
+       private new with_infos(items: NativeString, bytelen, from: Int)
        do
-               self._items = items
-               self._bytelen = bytelen
-               _first_byte = from
-               _bytepos = from
-               _length = _items.utf8_length(_first_byte, bytelen)
+               var len = items.utf8_length(from, bytelen)
+               if bytelen == len then return new ASCIIFlatString.full_data(items, bytelen, from, len)
+               return new UnicodeFlatString.full_data(items, bytelen, from, len)
        end
 
        # Low-level creation of a new string with all the data.
        #
        # `_items` will be used as is, without copy, to retrieve the characters of the string.
        # Aliasing issues is the responsibility of the caller.
-       private init full(items: NativeString, bytelen, from, length: Int)
+       private new full(items: NativeString, bytelen, from, length: Int)
        do
-               self._items = items
-               self._length = length
-               self._bytelen = bytelen
-               _first_byte = from
-               _bytepos = from
+               if bytelen == length then return new ASCIIFlatString.full_data(items, bytelen, from, length)
+               return new UnicodeFlatString.full_data(items, bytelen, from, length)
        end
 
        redef fun ==(other)
@@ -568,7 +641,6 @@ class FlatString
                return new FlatString.full(ns, new_bytelen, 0, newlen)
        end
 
-
        redef fun hash
        do
                if hash_cache == null then
@@ -593,6 +665,80 @@ class FlatString
        redef fun substrings do return new FlatSubstringsIter(self)
 end
 
+# Regular Nit UTF-8 strings
+private class UnicodeFlatString
+       super FlatString
+
+       init full_data(items: NativeString, bytelen, from, length: Int) do
+               self._items = items
+               self._length = length
+               self._bytelen = bytelen
+               _first_byte = from
+               _bytepos = from
+       end
+
+       redef fun substring_from(from) do
+               if from >= self._length then return empty
+               if from <= 0 then return self
+               var c = char_to_byte_index(from)
+               var st = c - _first_byte
+               var fln = bytelen - st
+               return new FlatString.full(items, fln, c, _length - from)
+       end
+end
+
+# Special cases of String where all the characters are ASCII-based
+#
+# Optimizes access operations to O(1) complexity.
+private class ASCIIFlatString
+       super FlatString
+
+       init full_data(items: NativeString, bytelen, from, length: Int) do
+               self._items = items
+               self._length = length
+               self._bytelen = bytelen
+               _first_byte = from
+               _bytepos = from
+       end
+
+       redef fun [](idx) do
+               assert idx < _bytelen and idx >= 0
+               return _items[idx + _first_byte].ascii
+       end
+
+       redef fun substring(from, count) do
+               var ln = _length
+               if count <= 0 then return ""
+               if (count + from) > ln then count = ln - from
+               if count <= 0 then return ""
+               if from < 0 then
+                       count += from
+                       if count <= 0 then return ""
+                       from = 0
+               end
+               return new ASCIIFlatString.full_data(_items, count, from + _first_byte, count)
+       end
+
+       redef fun reversed do
+               var b = new FlatBuffer.with_capacity(_bytelen + 1)
+               var i = _length - 1
+               while i >= 0 do
+                       b.add self[i]
+                       i -= 1
+               end
+               var s = b.to_s.as(FlatString)
+               return s
+       end
+
+       redef fun char_to_byte_index(index) do return index + _first_byte
+
+       redef fun substring_impl(from, count, end_index) do
+               return new ASCIIFlatString.full_data(_items, count, from + _first_byte, count)
+       end
+
+       redef fun fetch_char_at(i) do return _items[i + _first_byte].ascii
+end
+
 private class FlatStringCharReverseIterator
        super IndexedIterator[Char]
 
@@ -817,9 +963,12 @@ class FlatBuffer
 
        redef fun clear do
                is_dirty = true
-               if written then reset
                _bytelen = 0
                _length = 0
+               if written then
+                       _capacity = 16
+                       reset
+               end
        end
 
        redef fun empty do return new Buffer
@@ -828,12 +977,13 @@ class FlatBuffer
        do
                var c = capacity
                if cap <= c then return
-               while c <= cap do c = c * 2 + 2
+               if c <= 16 then c = 16
+               while c <= cap do c = c * 2
                # The COW flag can be set at false here, since
                # it does a copy of the current `Buffer`
                written = false
                var bln = _bytelen
-               var a = new NativeString(c+1)
+               var a = new NativeString(c)
                if bln > 0 then
                        var it = _items
                        if bln > 0 then it.copy_to(a, bln, 0, 0)
@@ -885,22 +1035,17 @@ class FlatBuffer
        init from(s: Text)
        do
                _items = new NativeString(s.bytelen)
-               if s isa FlatText then
-                       _items = s._items
-               else
-                       for i in substrings do i.as(FlatString)._items.copy_to(_items, i._bytelen, 0, 0)
-               end
+               for i in s.substrings do i._items.copy_to(_items, i._bytelen, first_byte, 0)
                _bytelen = s.bytelen
                _length = s.length
                _capacity = _bytelen
-               written = true
        end
 
        # Create a new empty string with a given capacity.
        init with_capacity(cap: Int)
        do
                assert cap >= 0
-               _items = new NativeString(cap + 1)
+               _items = new NativeString(cap)
                capacity = cap
                _bytelen = 0
        end
@@ -948,6 +1093,21 @@ class FlatBuffer
                return new FlatBuffer.with_infos(r_items, byte_length, byte_length, count)
        end
 
+       redef fun append_substring_impl(s, from, length) do
+               if length <= 0 then return
+               if not s isa FlatText then
+                       super
+                       return
+               end
+               var bytest = s.char_to_byte_index(from)
+               var bytend = s.char_to_byte_index(from + length - 1)
+               var btln = bytend - bytest + 1
+               enlarge(btln + _bytelen)
+               s._items.copy_to(_items, btln, bytest, _bytelen)
+               _bytelen += btln
+               _length += length
+       end
+
        redef fun reverse
        do
                written = false
@@ -1122,8 +1282,7 @@ redef class NativeString
                return to_s_with_length(cstring_length)
        end
 
-       # Returns `self` as a String of `length`.
-       redef fun to_s_with_length(length): FlatString
+       redef fun to_s_with_length(length)
        do
                assert length >= 0
                return clean_utf8(length)
@@ -1138,10 +1297,11 @@ redef class NativeString
                return new FlatString.with_infos(self, len, 0)
        end
 
-       # Returns `self` as a new String.
-       redef fun to_s_with_copy: FlatString
+       redef fun to_s_with_copy do return to_s_with_copy_and_length(cstring_length)
+
+       # Get a `String` from `length` bytes at `self` copied into Nit memory
+       fun to_s_with_copy_and_length(length: Int): String
        do
-               var length = cstring_length
                var r = clean_utf8(length)
                if r.items != self then return r
                var new_self = new NativeString(length + 1)
@@ -1250,48 +1410,29 @@ redef class NativeString
        #
        # Very unsafe, make sure to have room for this char prior to calling this function.
        private fun set_char_at(pos: Int, c: Char) do
-               if c.code_point < 128 then
-                       self[pos] = c.code_point.to_b
+               var cp = c.code_point
+               if cp < 128 then
+                       self[pos] = cp.to_b
                        return
                end
                var ln = c.u8char_len
-               native_set_char(pos, c, ln)
-       end
-
-       private fun native_set_char(pos: Int, c: Char, ln: Int) `{
-               char* dst = self + pos;
-               switch(ln){
-                       case 1:
-                               dst[0] = c;
-                               break;
-                       case 2:
-                               dst[0] = 0xC0 | ((c & 0x7C0) >> 6);
-                               dst[1] = 0x80 | (c & 0x3F);
-                               break;
-                       case 3:
-                               dst[0] = 0xE0 | ((c & 0xF000) >> 12);
-                               dst[1] = 0x80 | ((c & 0xFC0) >> 6);
-                               dst[2] = 0x80 | (c & 0x3F);
-                               break;
-                       case 4:
-                               dst[0] = 0xF0 | ((c & 0x1C0000) >> 18);
-                               dst[1] = 0x80 | ((c & 0x3F000) >> 12);
-                               dst[2] = 0x80 | ((c & 0xFC0) >> 6);
-                               dst[3] = 0x80 | (c & 0x3F);
-                               break;
-               }
-       `}
+               if ln == 2 then
+                       self[pos] = (0xC0 | ((cp & 0x7C0) >> 6)).to_b
+                       self[pos + 1] = (0x80 | (cp & 0x3F)).to_b
+               else if ln == 3 then
+                       self[pos] = (0xE0 | ((cp & 0xF000) >> 12)).to_b
+                       self[pos + 1] = (0x80 | ((cp & 0xFC0) >> 6)).to_b
+                       self[pos + 2] = (0x80 | (cp & 0x3F)).to_b
+               else if ln == 4 then
+                       self[pos] = (0xF0 | ((cp & 0x1C0000) >> 18)).to_b
+                       self[pos + 1] = (0x80 | ((cp & 0x3F000) >> 12)).to_b
+                       self[pos + 2] = (0x80 | ((cp & 0xFC0) >> 6)).to_b
+                       self[pos + 3] = (0x80 | (cp & 0x3F)).to_b
+               end
+       end
 end
 
 redef class Int
-       redef fun to_base(base, signed)
-       do
-               var l = digit_count(base)
-               var s = new FlatBuffer.from(" " * l)
-               fill_buffer(s, base, signed)
-               return s.to_s
-       end
-
        # return displayable int in base 10 and signed
        #
        #     assert 1.to_s            == "1"