X-Git-Url: http://nitlanguage.org diff --git a/lib/core/text/flat.nit b/lib/core/text/flat.nit index 200fdcf..6273609 100644 --- a/lib/core/text/flat.nit +++ b/lib/core/text/flat.nit @@ -55,7 +55,11 @@ redef class FlatText var its = _items if dpos == 1 then - b += _items.length_of_char_at(b) + if its[b] & 0x80u8 == 0x00u8 then + b += 1 + else + b += its.length_of_char_at(b) + end _bytepos = b _position = index return b @@ -221,6 +225,22 @@ redef class FlatText req_esc += 1 else if c == 0x5Cu8 then req_esc += 1 + else if c == 0x3Fu8 then + var j = pos + 1 + if j < length then + var next = its[j] + # We ignore `??'` because it will be escaped as `??\'`. + if + next == 0x21u8 or + next == 0x28u8 or + next == 0x29u8 or + next == 0x2Du8 or + next == 0x2Fu8 or + next == 0x3Cu8 or + next == 0x3Du8 or + next == 0x3Eu8 + then req_esc += 1 + end else if c < 32u8 then req_esc += 3 end @@ -276,6 +296,27 @@ redef class FlatText nns[opos] = 0x5Cu8 nns[opos + 1] = 0x5Cu8 opos += 2 + else if c == 0x3Fu8 then + var j = pos + 1 + if j < length then + var next = its[j] + # We ignore `??'` because it will be escaped as `??\'`. + if + next == 0x21u8 or + next == 0x28u8 or + next == 0x29u8 or + next == 0x2Du8 or + next == 0x2Fu8 or + next == 0x3Cu8 or + next == 0x3Du8 or + next == 0x3Eu8 + then + nns[opos] = 0x5Cu8 + opos += 1 + end + end + nns[opos] = 0x3Fu8 + opos += 1 else if c < 32u8 then nns[opos] = 0x5Cu8 nns[opos + 1] = 0x30u8 @@ -292,7 +333,49 @@ redef class FlatText end redef fun [](index) do - assert index >= 0 and index < _length + var len = _length + + # Statistically: + # * ~70% want the next char + # * ~23% want the previous + # * ~7% want the same char + # + # So it makes sense to shortcut early. And early is here. + var dpos = index - _position + var b = _bytepos + if dpos == 1 and index < len - 1 then + var its = _items + var c = its[b] + if c & 0x80u8 == 0x00u8 then + # We want the next, and current is easy. + # So next is easy to find! + b += 1 + _position = index + _bytepos = b + # The rest will be done by `dpos==0` bellow. + dpos = 0 + end + else if dpos == -1 and index > 1 then + var its = _items + var c = its[b-1] + if c & 0x80u8 == 0x00u8 then + # We want the previous, and it is easy. + b -= 1 + dpos = 0 + _position = index + _bytepos = b + return c.ascii + end + end + if dpos == 0 then + # We know what we want (+0 or +1) just get it now! + var its = _items + var c = its[b] + if c & 0x80u8 == 0x00u8 then return c.ascii + return items.char_at(b) + end + + assert index >= 0 and index < len return fetch_char_at(index) end @@ -323,10 +406,14 @@ redef class FlatText end return res end + + redef fun copy_to_native(dst, n, src_off, dst_off) do + _items.copy_to(dst, n, first_byte + src_off, dst_off) + end end # Immutable strings of characters. -class FlatString +abstract class FlatString super FlatText super String @@ -359,22 +446,13 @@ class FlatString redef fun fast_cstring do return _items.fast_cstring(_first_byte) - redef fun substring_from(from) do - if from >= self._length then return empty - if from <= 0 then return self - var c = char_to_byte_index(from) - var st = c - _first_byte - var fln = bytelen - st - return new FlatString.full(items, fln, c, _length - from) - end - redef fun substring(from, count) do if count <= 0 then return "" if from < 0 then count += from - if count < 0 then return "" + if count <= 0 then return "" from = 0 end @@ -452,26 +530,21 @@ class FlatString # # `_items` will be used as is, without copy, to retrieve the characters of the string. # Aliasing issues is the responsibility of the caller. - private init with_infos(items: NativeString, bytelen, from: Int) + private new with_infos(items: NativeString, bytelen, from: Int) do - self._items = items - self._bytelen = bytelen - _first_byte = from - _bytepos = from - _length = _items.utf8_length(_first_byte, bytelen) + var len = items.utf8_length(from, bytelen) + if bytelen == len then return new ASCIIFlatString.full_data(items, bytelen, from, len) + return new UnicodeFlatString.full_data(items, bytelen, from, len) end # Low-level creation of a new string with all the data. # # `_items` will be used as is, without copy, to retrieve the characters of the string. # Aliasing issues is the responsibility of the caller. - private init full(items: NativeString, bytelen, from, length: Int) + private new full(items: NativeString, bytelen, from, length: Int) do - self._items = items - self._length = length - self._bytelen = bytelen - _first_byte = from - _bytepos = from + if bytelen == length then return new ASCIIFlatString.full_data(items, bytelen, from, length) + return new UnicodeFlatString.full_data(items, bytelen, from, length) end redef fun ==(other) @@ -568,7 +641,6 @@ class FlatString return new FlatString.full(ns, new_bytelen, 0, newlen) end - redef fun hash do if hash_cache == null then @@ -593,6 +665,80 @@ class FlatString redef fun substrings do return new FlatSubstringsIter(self) end +# Regular Nit UTF-8 strings +private class UnicodeFlatString + super FlatString + + init full_data(items: NativeString, bytelen, from, length: Int) do + self._items = items + self._length = length + self._bytelen = bytelen + _first_byte = from + _bytepos = from + end + + redef fun substring_from(from) do + if from >= self._length then return empty + if from <= 0 then return self + var c = char_to_byte_index(from) + var st = c - _first_byte + var fln = bytelen - st + return new FlatString.full(items, fln, c, _length - from) + end +end + +# Special cases of String where all the characters are ASCII-based +# +# Optimizes access operations to O(1) complexity. +private class ASCIIFlatString + super FlatString + + init full_data(items: NativeString, bytelen, from, length: Int) do + self._items = items + self._length = length + self._bytelen = bytelen + _first_byte = from + _bytepos = from + end + + redef fun [](idx) do + assert idx < _bytelen and idx >= 0 + return _items[idx + _first_byte].ascii + end + + redef fun substring(from, count) do + var ln = _length + if count <= 0 then return "" + if (count + from) > ln then count = ln - from + if count <= 0 then return "" + if from < 0 then + count += from + if count <= 0 then return "" + from = 0 + end + return new ASCIIFlatString.full_data(_items, count, from + _first_byte, count) + end + + redef fun reversed do + var b = new FlatBuffer.with_capacity(_bytelen + 1) + var i = _length - 1 + while i >= 0 do + b.add self[i] + i -= 1 + end + var s = b.to_s.as(FlatString) + return s + end + + redef fun char_to_byte_index(index) do return index + _first_byte + + redef fun substring_impl(from, count, end_index) do + return new ASCIIFlatString.full_data(_items, count, from + _first_byte, count) + end + + redef fun fetch_char_at(i) do return _items[i + _first_byte].ascii +end + private class FlatStringCharReverseIterator super IndexedIterator[Char] @@ -817,9 +963,12 @@ class FlatBuffer redef fun clear do is_dirty = true - if written then reset _bytelen = 0 _length = 0 + if written then + _capacity = 16 + reset + end end redef fun empty do return new Buffer @@ -828,12 +977,13 @@ class FlatBuffer do var c = capacity if cap <= c then return - while c <= cap do c = c * 2 + 2 + if c <= 16 then c = 16 + while c <= cap do c = c * 2 # The COW flag can be set at false here, since # it does a copy of the current `Buffer` written = false var bln = _bytelen - var a = new NativeString(c+1) + var a = new NativeString(c) if bln > 0 then var it = _items if bln > 0 then it.copy_to(a, bln, 0, 0) @@ -885,22 +1035,17 @@ class FlatBuffer init from(s: Text) do _items = new NativeString(s.bytelen) - if s isa FlatText then - _items = s._items - else - for i in substrings do i.as(FlatString)._items.copy_to(_items, i._bytelen, 0, 0) - end + for i in s.substrings do i._items.copy_to(_items, i._bytelen, first_byte, 0) _bytelen = s.bytelen _length = s.length _capacity = _bytelen - written = true end # Create a new empty string with a given capacity. init with_capacity(cap: Int) do assert cap >= 0 - _items = new NativeString(cap + 1) + _items = new NativeString(cap) capacity = cap _bytelen = 0 end @@ -948,6 +1093,21 @@ class FlatBuffer return new FlatBuffer.with_infos(r_items, byte_length, byte_length, count) end + redef fun append_substring_impl(s, from, length) do + if length <= 0 then return + if not s isa FlatText then + super + return + end + var bytest = s.char_to_byte_index(from) + var bytend = s.char_to_byte_index(from + length - 1) + var btln = bytend - bytest + 1 + enlarge(btln + _bytelen) + s._items.copy_to(_items, btln, bytest, _bytelen) + _bytelen += btln + _length += length + end + redef fun reverse do written = false @@ -1122,8 +1282,7 @@ redef class NativeString return to_s_with_length(cstring_length) end - # Returns `self` as a String of `length`. - redef fun to_s_with_length(length): FlatString + redef fun to_s_with_length(length) do assert length >= 0 return clean_utf8(length) @@ -1138,10 +1297,11 @@ redef class NativeString return new FlatString.with_infos(self, len, 0) end - # Returns `self` as a new String. - redef fun to_s_with_copy: FlatString + redef fun to_s_with_copy do return to_s_with_copy_and_length(cstring_length) + + # Get a `String` from `length` bytes at `self` copied into Nit memory + fun to_s_with_copy_and_length(length: Int): String do - var length = cstring_length var r = clean_utf8(length) if r.items != self then return r var new_self = new NativeString(length + 1) @@ -1250,48 +1410,29 @@ redef class NativeString # # Very unsafe, make sure to have room for this char prior to calling this function. private fun set_char_at(pos: Int, c: Char) do - if c.code_point < 128 then - self[pos] = c.code_point.to_b + var cp = c.code_point + if cp < 128 then + self[pos] = cp.to_b return end var ln = c.u8char_len - native_set_char(pos, c, ln) - end - - private fun native_set_char(pos: Int, c: Char, ln: Int) `{ - char* dst = self + pos; - switch(ln){ - case 1: - dst[0] = c; - break; - case 2: - dst[0] = 0xC0 | ((c & 0x7C0) >> 6); - dst[1] = 0x80 | (c & 0x3F); - break; - case 3: - dst[0] = 0xE0 | ((c & 0xF000) >> 12); - dst[1] = 0x80 | ((c & 0xFC0) >> 6); - dst[2] = 0x80 | (c & 0x3F); - break; - case 4: - dst[0] = 0xF0 | ((c & 0x1C0000) >> 18); - dst[1] = 0x80 | ((c & 0x3F000) >> 12); - dst[2] = 0x80 | ((c & 0xFC0) >> 6); - dst[3] = 0x80 | (c & 0x3F); - break; - } - `} + if ln == 2 then + self[pos] = (0xC0 | ((cp & 0x7C0) >> 6)).to_b + self[pos + 1] = (0x80 | (cp & 0x3F)).to_b + else if ln == 3 then + self[pos] = (0xE0 | ((cp & 0xF000) >> 12)).to_b + self[pos + 1] = (0x80 | ((cp & 0xFC0) >> 6)).to_b + self[pos + 2] = (0x80 | (cp & 0x3F)).to_b + else if ln == 4 then + self[pos] = (0xF0 | ((cp & 0x1C0000) >> 18)).to_b + self[pos + 1] = (0x80 | ((cp & 0x3F000) >> 12)).to_b + self[pos + 2] = (0x80 | ((cp & 0xFC0) >> 6)).to_b + self[pos + 3] = (0x80 | (cp & 0x3F)).to_b + end + end end redef class Int - redef fun to_base(base, signed) - do - var l = digit_count(base) - var s = new FlatBuffer.from(" " * l) - fill_buffer(s, base, signed) - return s.to_s - end - # return displayable int in base 10 and signed # # assert 1.to_s == "1"