X-Git-Url: http://nitlanguage.org diff --git a/lib/core/text/flat.nit b/lib/core/text/flat.nit index 2343f9f..6273609 100644 --- a/lib/core/text/flat.nit +++ b/lib/core/text/flat.nit @@ -36,28 +36,49 @@ end redef class FlatText - private fun first_byte: Int do return 0 + # First byte of the NativeString + protected fun first_byte: Int do return 0 - private fun last_byte: Int do return bytelen - 1 + # Last byte of the NativeString + protected fun last_byte: Int do return first_byte + _bytelen - 1 # Cache of the latest position (char) explored in the string var position: Int = 0 # Cached position (bytes) in the NativeString underlying the String - var bytepos: Int = first_byte is lateinit + var bytepos: Int = 0 - # Index of the character `index` in `items` - private fun char_to_byte_index(index: Int): Int do - var ln = length - assert index >= 0 - assert index < ln + # Index of the character `index` in `_items` + fun char_to_byte_index(index: Int): Int do + var dpos = index - _position + var b = _bytepos + var its = _items + + if dpos == 1 then + if its[b] & 0x80u8 == 0x00u8 then + b += 1 + else + b += its.length_of_char_at(b) + end + _bytepos = b + _position = index + return b + end + if dpos == -1 then + b = its.find_beginning_of_char_at(b - 1) + _bytepos = b + _position = index + return b + end + if dpos == 0 then return b + var ln = _length + var pos = _position # Find best insertion point var delta_begin = index var delta_end = (ln - 1) - index - var delta_cache = (position - index).abs + var delta_cache = (pos - index).abs var min = delta_begin - var its = items if delta_cache < min then min = delta_cache if delta_end < min then min = delta_end @@ -65,125 +86,402 @@ redef class FlatText var ns_i: Int var my_i: Int - if min == delta_begin then + if min == delta_cache then + ns_i = _bytepos + my_i = pos + else if min == delta_begin then ns_i = first_byte my_i = 0 - else if min == delta_cache then - ns_i = bytepos - my_i = position else ns_i = its.find_beginning_of_char_at(last_byte) - my_i = length - 1 + my_i = _length - 1 end ns_i = its.char_to_byte_index_cached(index, my_i, ns_i) - position = index - bytepos = ns_i + _position = index + _bytepos = ns_i return ns_i end - private fun byte_to_char_index(index: Int): Int do - var ln = bytelen - assert index >= 0 - assert index < bytelen + # By escaping `self` to HTML, how many more bytes will be needed ? + fun chars_to_html_escape: Int do + var its = _items + var max = last_byte + var pos = first_byte + var endlen = 0 + while pos <= max do + var c = its[pos] + if c == 0x3Cu8 then + endlen += 3 + else if c == 0x3Eu8 then + endlen += 3 + else if c == 0x26u8 then + endlen += 4 + else if c == 0x22u8 then + endlen += 4 + else if c == 0x27u8 then + endlen += 4 + else if c == 0x2Fu8 then + endlen += 4 + end + pos += 1 + end + return endlen + end - # Find best insertion point - var delta_begin = index - var delta_end = (ln - 1) - index - var delta_cache = (bytepos - index).abs - var min = delta_begin - var its = items + redef fun html_escape + do + var extra = chars_to_html_escape + if extra == 0 then return to_s + var its = _items + var max = last_byte + var pos = first_byte + var nlen = extra + _bytelen + var nits = new NativeString(nlen) + var outpos = 0 + while pos <= max do + var c = its[pos] + # Special codes: + # Some HTML characters are used as meta-data, they need + # to be replaced by an HTML-Escaped equivalent + # + # * 0x3C (<) => < + # * 0x3E (>) => > + # * 0x26 (&) => & + # * 0x22 (") => " + # * 0x27 (') => ' + # * 0x2F (/) => / + if c == 0x3Cu8 then + nits[outpos] = 0x26u8 + nits[outpos + 1] = 0x6Cu8 + nits[outpos + 2] = 0x74u8 + nits[outpos + 3] = 0x3Bu8 + outpos += 4 + else if c == 0x3Eu8 then + nits[outpos] = 0x26u8 + nits[outpos + 1] = 0x67u8 + nits[outpos + 2] = 0x74u8 + nits[outpos + 3] = 0x3Bu8 + outpos += 4 + else if c == 0x26u8 then + nits[outpos] = 0x26u8 + nits[outpos + 1] = 0x61u8 + nits[outpos + 2] = 0x6Du8 + nits[outpos + 3] = 0x70u8 + nits[outpos + 4] = 0x3Bu8 + outpos += 5 + else if c == 0x22u8 then + nits[outpos] = 0x26u8 + nits[outpos + 1] = 0x23u8 + nits[outpos + 2] = 0x33u8 + nits[outpos + 3] = 0x34u8 + nits[outpos + 4] = 0x3Bu8 + outpos += 5 + else if c == 0x27u8 then + nits[outpos] = 0x26u8 + nits[outpos + 1] = 0x23u8 + nits[outpos + 2] = 0x33u8 + nits[outpos + 3] = 0x39u8 + nits[outpos + 4] = 0x3Bu8 + outpos += 5 + else if c == 0x2Fu8 then + nits[outpos] = 0x26u8 + nits[outpos + 1] = 0x23u8 + nits[outpos + 2] = 0x34u8 + nits[outpos + 3] = 0x37u8 + nits[outpos + 4] = 0x3Bu8 + outpos += 5 + else + nits[outpos] = c + outpos += 1 + end + pos += 1 + end + var s = new FlatString.with_infos(nits, nlen, 0) + return s + end - if delta_cache < min then min = delta_cache - if delta_end < min then min = delta_end + # By escaping `self` to C, how many more bytes will be needed ? + # + # This enables a double-optimization in `escape_to_c` since if this + # method returns 0, then `self` does not need escaping and can be + # returned as-is + fun chars_to_escape_to_c: Int do + var its = _items + var max = last_byte + var pos = first_byte + var req_esc = 0 + while pos <= max do + var c = its[pos] + if c == 0x0Au8 then + req_esc += 1 + else if c == 0x09u8 then + req_esc += 1 + else if c == 0x22u8 then + req_esc += 1 + else if c == 0x27u8 then + req_esc += 1 + else if c == 0x5Cu8 then + req_esc += 1 + else if c == 0x3Fu8 then + var j = pos + 1 + if j < length then + var next = its[j] + # We ignore `??'` because it will be escaped as `??\'`. + if + next == 0x21u8 or + next == 0x28u8 or + next == 0x29u8 or + next == 0x2Du8 or + next == 0x2Fu8 or + next == 0x3Cu8 or + next == 0x3Du8 or + next == 0x3Eu8 + then req_esc += 1 + end + else if c < 32u8 then + req_esc += 3 + end + pos += 1 + end + return req_esc + end - var ns_i: Int - var my_i: Int + redef fun escape_to_c do + var ln_extra = chars_to_escape_to_c + if ln_extra == 0 then return self.to_s + var its = _items + var max = last_byte + var nlen = _bytelen + ln_extra + var nns = new NativeString(nlen) + var pos = first_byte + var opos = 0 + while pos <= max do + var c = its[pos] + # Special codes: + # + # Any byte with value < 32 is a control character + # All their uses will be replaced by their octal + # value in C. + # + # There are two exceptions however: + # + # * 0x09 => \t + # * 0x0A => \n + # + # Aside from the code points above, the following are: + # + # * 0x22 => \" + # * 0x27 => \' + # * 0x5C => \\ + if c == 0x09u8 then + nns[opos] = 0x5Cu8 + nns[opos + 1] = 0x74u8 + opos += 2 + else if c == 0x0Au8 then + nns[opos] = 0x5Cu8 + nns[opos + 1] = 0x6Eu8 + opos += 2 + else if c == 0x22u8 then + nns[opos] = 0x5Cu8 + nns[opos + 1] = 0x22u8 + opos += 2 + else if c == 0x27u8 then + nns[opos] = 0x5Cu8 + nns[opos + 1] = 0x27u8 + opos += 2 + else if c == 0x5Cu8 then + nns[opos] = 0x5Cu8 + nns[opos + 1] = 0x5Cu8 + opos += 2 + else if c == 0x3Fu8 then + var j = pos + 1 + if j < length then + var next = its[j] + # We ignore `??'` because it will be escaped as `??\'`. + if + next == 0x21u8 or + next == 0x28u8 or + next == 0x29u8 or + next == 0x2Du8 or + next == 0x2Fu8 or + next == 0x3Cu8 or + next == 0x3Du8 or + next == 0x3Eu8 + then + nns[opos] = 0x5Cu8 + opos += 1 + end + end + nns[opos] = 0x3Fu8 + opos += 1 + else if c < 32u8 then + nns[opos] = 0x5Cu8 + nns[opos + 1] = 0x30u8 + nns[opos + 2] = ((c & 0x38u8) >> 3) + 0x30u8 + nns[opos + 3] = (c & 0x07u8) + 0x30u8 + opos += 4 + else + nns[opos] = c + opos += 1 + end + pos += 1 + end + return nns.to_s_unsafe(nlen) + end - if min == delta_begin then - ns_i = first_byte - my_i = 0 - else if min == delta_cache then - ns_i = bytepos - my_i = position - else - ns_i = its.find_beginning_of_char_at(last_byte) - my_i = length - 1 + redef fun [](index) do + var len = _length + + # Statistically: + # * ~70% want the next char + # * ~23% want the previous + # * ~7% want the same char + # + # So it makes sense to shortcut early. And early is here. + var dpos = index - _position + var b = _bytepos + if dpos == 1 and index < len - 1 then + var its = _items + var c = its[b] + if c & 0x80u8 == 0x00u8 then + # We want the next, and current is easy. + # So next is easy to find! + b += 1 + _position = index + _bytepos = b + # The rest will be done by `dpos==0` bellow. + dpos = 0 + end + else if dpos == -1 and index > 1 then + var its = _items + var c = its[b-1] + if c & 0x80u8 == 0x00u8 then + # We want the previous, and it is easy. + b -= 1 + dpos = 0 + _position = index + _bytepos = b + return c.ascii + end + end + if dpos == 0 then + # We know what we want (+0 or +1) just get it now! + var its = _items + var c = its[b] + if c & 0x80u8 == 0x00u8 then return c.ascii + return items.char_at(b) end - my_i = its.byte_to_char_index_cached(index, my_i, ns_i) + assert index >= 0 and index < len + return fetch_char_at(index) + end - position = my_i - bytepos = index + # Gets a `Char` at `index` in `self` + # + # WARNING: Use at your own risks as no bound-checking is done + fun fetch_char_at(index: Int): Char do + var i = char_to_byte_index(index) + var items = _items + var b = items[i] + if b & 0x80u8 == 0x00u8 then return b.ascii + return items.char_at(i) + end - return my_i + # If `self` contains only digits and alpha <= 'f', return the corresponding integer. + # + # assert "ff".to_hex == 255 + redef fun to_hex(pos, ln) do + var res = 0 + if pos == null then pos = 0 + if ln == null then ln = length - pos + pos = char_to_byte_index(pos) + var its = _items + var max = pos + ln + for i in [pos .. max[ do + res <<= 4 + res += its[i].ascii.from_hex + end + return res end - redef fun [](index) do return items.char_at(char_to_byte_index(index)) + redef fun copy_to_native(dst, n, src_off, dst_off) do + _items.copy_to(dst, n, first_byte + src_off, dst_off) + end end # Immutable strings of characters. -class FlatString +abstract class FlatString super FlatText super String - # Index at which `self` begins in `items`, inclusively + # Index at which `self` begins in `_items`, inclusively redef var first_byte is noinit - # Index at which `self` ends in `items`, inclusively - redef var last_byte is noinit - redef var chars = new FlatStringCharView(self) is lazy redef var bytes = new FlatStringByteView(self) is lazy - redef var length is lazy do - if bytelen == 0 then return 0 - var st = first_byte - var its = items - var ln = 0 - var lst = last_byte - while st <= lst do - st += its.length_of_char_at(st) - ln += 1 - end - return ln + redef var to_cstring is lazy do + var blen = _bytelen + var new_items = new NativeString(blen + 1) + _items.copy_to(new_items, blen, _first_byte, 0) + new_items[blen] = 0u8 + return new_items end - redef fun reversed - do - var b = new FlatBuffer.with_capacity(bytelen + 1) - for i in [length - 1 .. 0].step(-1) do - b.add self[i] + redef fun reversed do + var b = new FlatBuffer.with_capacity(_bytelen + 1) + var i = _length - 1 + while i >= 0 do + b.add self.fetch_char_at(i) + i -= 1 end var s = b.to_s.as(FlatString) - s.length = self.length + s._length = self._length return s end - redef fun fast_cstring do return items.fast_cstring(first_byte) + redef fun fast_cstring do return _items.fast_cstring(_first_byte) redef fun substring(from, count) do - assert count >= 0 + if count <= 0 then return "" if from < 0 then count += from - if count < 0 then count = 0 + if count <= 0 then return "" from = 0 end - if (count + from) > length then count = length - from + var ln = _length + if (count + from) > ln then count = ln - from if count <= 0 then return "" var end_index = from + count - 1 + return substring_impl(from, count, end_index) + end + + private fun substring_impl(from, count, end_index: Int): String do + var cache = _position + var dfrom = (cache - from).abs + var dend = (end_index - from).abs + + var bytefrom: Int + var byteto: Int + if dfrom < dend then + bytefrom = char_to_byte_index(from) + byteto = char_to_byte_index(end_index) + else + byteto = char_to_byte_index(end_index) + bytefrom = char_to_byte_index(from) + end - var bytefrom = char_to_byte_index(from) - var byteto = char_to_byte_index(end_index) - byteto += items.length_of_char_at(byteto) - 1 + var its = _items + byteto += its.length_of_char_at(byteto) - 1 - var s = new FlatString.full(items, byteto - bytefrom + 1, bytefrom, byteto, count) + var s = new FlatString.full(its, byteto - bytefrom + 1, bytefrom, count) return s end @@ -191,9 +489,9 @@ class FlatString redef fun to_upper do - var outstr = new FlatBuffer.with_capacity(self.bytelen + 1) + var outstr = new FlatBuffer.with_capacity(self._bytelen + 1) - var mylen = length + var mylen = _length var pos = 0 while pos < mylen do @@ -206,9 +504,9 @@ class FlatString redef fun to_lower do - var outstr = new FlatBuffer.with_capacity(self.bytelen + 1) + var outstr = new FlatBuffer.with_capacity(self._bytelen + 1) - var mylen = length + var mylen = _length var pos = 0 while pos < mylen do @@ -230,58 +528,45 @@ class FlatString # Low-level creation of a new string with minimal data. # - # `items` will be used as is, without copy, to retrieve the characters of the string. + # `_items` will be used as is, without copy, to retrieve the characters of the string. # Aliasing issues is the responsibility of the caller. - private init with_infos(items: NativeString, bytelen, from, to: Int) + private new with_infos(items: NativeString, bytelen, from: Int) do - self.items = items - self.bytelen = bytelen - first_byte = from - last_byte = to + var len = items.utf8_length(from, bytelen) + if bytelen == len then return new ASCIIFlatString.full_data(items, bytelen, from, len) + return new UnicodeFlatString.full_data(items, bytelen, from, len) end # Low-level creation of a new string with all the data. # - # `items` will be used as is, without copy, to retrieve the characters of the string. + # `_items` will be used as is, without copy, to retrieve the characters of the string. # Aliasing issues is the responsibility of the caller. - private init full(items: NativeString, bytelen, from, to, length: Int) + private new full(items: NativeString, bytelen, from, length: Int) do - self.items = items - self.length = length - self.bytelen = bytelen - first_byte = from - last_byte = to - end - - redef fun to_cstring do - if real_items != null then return real_items.as(not null) - var new_items = new NativeString(bytelen + 1) - self.items.copy_to(new_items, bytelen, first_byte, 0) - new_items[bytelen] = 0u8 - real_items = new_items - return new_items + if bytelen == length then return new ASCIIFlatString.full_data(items, bytelen, from, length) + return new UnicodeFlatString.full_data(items, bytelen, from, length) end redef fun ==(other) do - if not other isa FlatString then return super + if not other isa FlatText then return super if self.object_id == other.object_id then return true - var my_length = bytelen + var my_length = _bytelen - if other.bytelen != my_length then return false + if other._bytelen != my_length then return false - var my_index = first_byte + var my_index = _first_byte var its_index = other.first_byte var last_iteration = my_index + my_length - var itsitems = other.items - var myitems = self.items + var its_items = other._items + var my_items = self._items while my_index < last_iteration do - if myitems[my_index] != itsitems[its_index] then return false + if my_items[my_index] != its_items[its_index] then return false my_index += 1 its_index += 1 end @@ -291,78 +576,83 @@ class FlatString redef fun <(other) do - if not other isa FlatString then return super + if not other isa FlatText then return super if self.object_id == other.object_id then return false - var my_length = self.bytelen - var its_length = other.bytelen + var myits = _items + var itsits = other._items - var max = if my_length < its_length then my_length else its_length + var mbt = _bytelen + var obt = other.bytelen - var myits = self.bytes - var itsits = other.bytes + var minln = if mbt < obt then mbt else obt + var mst = _first_byte + var ost = other.first_byte - for i in [0 .. max[ do - var my_curr_char = myits[i] - var its_curr_char = itsits[i] + for i in [0 .. minln[ do + var my_curr_char = myits[mst] + var its_curr_char = itsits[ost] - if my_curr_char != its_curr_char then - if my_curr_char < its_curr_char then return true - return false - end + if my_curr_char > its_curr_char then return false + if my_curr_char < its_curr_char then return true + + mst += 1 + ost += 1 end - return my_length < its_length + return mbt < obt end redef fun +(o) do var s = o.to_s var slen = s.bytelen - var mlen = bytelen + var mlen = _bytelen var nlen = mlen + slen - var mits = items - var mifrom = first_byte + var mits = _items + var mifrom = _first_byte if s isa FlatText then - var sits = s.items + var sits = s._items var sifrom = s.first_byte var ns = new NativeString(nlen + 1) mits.copy_to(ns, mlen, mifrom, 0) sits.copy_to(ns, slen, sifrom, mlen) - return new FlatString.full(ns, nlen, 0, nlen - 1, length + o.length) + return new FlatString.full(ns, nlen, 0, _length + o.length) else abort end end redef fun *(i) do - var mybtlen = bytelen + var mybtlen = _bytelen var new_bytelen = mybtlen * i - var mylen = length + var mylen = _length var newlen = mylen * i + var its = _items + var fb = _first_byte var ns = new NativeString(new_bytelen + 1) ns[new_bytelen] = 0u8 var offset = 0 while i > 0 do - items.copy_to(ns, bytelen, first_byte, offset) + its.copy_to(ns, mybtlen, fb, offset) offset += mybtlen i -= 1 end - return new FlatString.full(ns, new_bytelen, 0, new_bytelen - 1, newlen) + return new FlatString.full(ns, new_bytelen, 0, newlen) end - redef fun hash do if hash_cache == null then # djb2 hash algorithm var h = 5381 - var i = first_byte + var i = _first_byte - var myitems = items + var my_items = _items + var max = last_byte - while i <= last_byte do - h = (h << 5) + h + myitems[i].to_i + while i <= max do + h = (h << 5) + h + my_items[i].to_i i += 1 end @@ -375,6 +665,80 @@ class FlatString redef fun substrings do return new FlatSubstringsIter(self) end +# Regular Nit UTF-8 strings +private class UnicodeFlatString + super FlatString + + init full_data(items: NativeString, bytelen, from, length: Int) do + self._items = items + self._length = length + self._bytelen = bytelen + _first_byte = from + _bytepos = from + end + + redef fun substring_from(from) do + if from >= self._length then return empty + if from <= 0 then return self + var c = char_to_byte_index(from) + var st = c - _first_byte + var fln = bytelen - st + return new FlatString.full(items, fln, c, _length - from) + end +end + +# Special cases of String where all the characters are ASCII-based +# +# Optimizes access operations to O(1) complexity. +private class ASCIIFlatString + super FlatString + + init full_data(items: NativeString, bytelen, from, length: Int) do + self._items = items + self._length = length + self._bytelen = bytelen + _first_byte = from + _bytepos = from + end + + redef fun [](idx) do + assert idx < _bytelen and idx >= 0 + return _items[idx + _first_byte].ascii + end + + redef fun substring(from, count) do + var ln = _length + if count <= 0 then return "" + if (count + from) > ln then count = ln - from + if count <= 0 then return "" + if from < 0 then + count += from + if count <= 0 then return "" + from = 0 + end + return new ASCIIFlatString.full_data(_items, count, from + _first_byte, count) + end + + redef fun reversed do + var b = new FlatBuffer.with_capacity(_bytelen + 1) + var i = _length - 1 + while i >= 0 do + b.add self[i] + i -= 1 + end + var s = b.to_s.as(FlatString) + return s + end + + redef fun char_to_byte_index(index) do return index + _first_byte + + redef fun substring_impl(from, count, end_index) do + return new ASCIIFlatString.full_data(_items, count, from + _first_byte, count) + end + + redef fun fetch_char_at(i) do return _items[i + _first_byte].ascii +end + private class FlatStringCharReverseIterator super IndexedIterator[Char] @@ -382,11 +746,6 @@ private class FlatStringCharReverseIterator var curr_pos: Int - init with_pos(tgt: FlatString, pos: Int) - do - init(tgt, pos) - end - redef fun is_ok do return curr_pos >= 0 redef fun item do return target[curr_pos] @@ -402,14 +761,11 @@ private class FlatStringCharIterator var target: FlatString - var max: Int + var max: Int is noautoinit var curr_pos: Int - init with_pos(tgt: FlatString, pos: Int) - do - init(tgt, tgt.length - 1, pos) - end + init do max = target._length - 1 redef fun is_ok do return curr_pos <= max @@ -428,9 +784,9 @@ private class FlatStringCharView redef fun [](index) do return target[index] - redef fun iterator_from(start) do return new FlatStringCharIterator.with_pos(target, start) + redef fun iterator_from(start) do return new FlatStringCharIterator(target, start) - redef fun reverse_iterator_from(start) do return new FlatStringCharReverseIterator.with_pos(target, start) + redef fun reverse_iterator_from(start) do return new FlatStringCharReverseIterator(target, start) end @@ -439,22 +795,24 @@ private class FlatStringByteReverseIterator var target: FlatString - var target_items: NativeString + var target_items: NativeString is noautoinit var curr_pos: Int - init with_pos(tgt: FlatString, pos: Int) + init do - init(tgt, tgt.items, pos + tgt.first_byte) + var tgt = target + target_items = tgt._items + curr_pos += tgt._first_byte end - redef fun is_ok do return curr_pos >= target.first_byte + redef fun is_ok do return curr_pos >= target._first_byte redef fun item do return target_items[curr_pos] redef fun next do curr_pos -= 1 - redef fun index do return curr_pos - target.first_byte + redef fun index do return curr_pos - target._first_byte end @@ -463,13 +821,15 @@ private class FlatStringByteIterator var target: FlatString - var target_items: NativeString + var target_items: NativeString is noautoinit var curr_pos: Int - init with_pos(tgt: FlatString, pos: Int) + init do - init(tgt, tgt.items, pos + tgt.first_byte) + var tgt = target + target_items = tgt._items + curr_pos += tgt._first_byte end redef fun is_ok do return curr_pos <= target.last_byte @@ -478,7 +838,7 @@ private class FlatStringByteIterator redef fun next do curr_pos += 1 - redef fun index do return curr_pos - target.first_byte + redef fun index do return curr_pos - target._first_byte end @@ -489,17 +849,17 @@ private class FlatStringByteView redef fun [](index) do - # Check that the index (+ first_byte) is not larger than last_byte + # Check that the index (+ _first_byte) is not larger than last_byte # In other terms, if the index is valid - assert index >= 0 - var target = self.target - assert (index + target.first_byte) <= target.last_byte - return target.items[index + target.first_byte] + var target = _target + assert index >= 0 and index < target._bytelen + var ind = index + target._first_byte + return target._items[ind] end - redef fun iterator_from(start) do return new FlatStringByteIterator.with_pos(target, start) + redef fun iterator_from(start) do return new FlatStringByteIterator(target, start) - redef fun reverse_iterator_from(start) do return new FlatStringByteReverseIterator.with_pos(target, start) + redef fun reverse_iterator_from(start) do return new FlatStringByteReverseIterator(target, start) end @@ -518,17 +878,16 @@ class FlatBuffer redef var bytes = new FlatBufferByteView(self) is lazy - redef var bytelen = 0 - - redef var length = 0 - private var char_cache: Int = -1 private var byte_cache: Int = -1 private var capacity = 0 - redef fun fast_cstring do return items.fast_cstring(0) + # Real items, used as cache for when to_cstring is called + private var real_items: NativeString is noinit + + redef fun fast_cstring do return _items.fast_cstring(0) redef fun substrings do return new FlatSubstringsIter(self) @@ -538,43 +897,46 @@ class FlatBuffer # the Copy-On-Write flag `written` is set at true. private fun reset do var nns = new NativeString(capacity) - items.copy_to(nns, bytelen, 0, 0) - items = nns + if _bytelen != 0 then _items.copy_to(nns, _bytelen, 0, 0) + _items = nns written = false end # Shifts the content of the buffer by `len` bytes to the right, starting at byte `from` # - # Internal only, does not modify bytelen or length, this is the caller's responsability + # Internal only, does not modify _bytelen or length, this is the caller's responsability private fun rshift_bytes(from: Int, len: Int) do - var oit = items - var nit = items - if bytelen + len > capacity then + var oit = _items + var nit = _items + var bt = _bytelen + if bt + len > capacity then capacity = capacity * 2 + 2 nit = new NativeString(capacity) oit.copy_to(nit, 0, 0, from) end - oit.copy_to(nit, bytelen - from, from, from + len) + oit.copy_to(nit, bt - from, from, from + len) end # Shifts the content of the buffer by `len` bytes to the left, starting at `from` # - # Internal only, does not modify bytelen or length, this is the caller's responsability + # Internal only, does not modify _bytelen or length, this is the caller's responsability private fun lshift_bytes(from: Int, len: Int) do - items.copy_to(items, bytelen - from, from, from - len) + var it = _items + it.copy_to(it, _bytelen - from, from, from - len) end redef fun []=(index, item) do - assert index >= 0 and index <= length + assert index >= 0 and index <= _length if written then reset is_dirty = true - if index == length then + if index == _length then add item return end - var ip = items.char_to_byte_index(index) - var c = items.char_at(ip) + var it = _items + var ip = it.char_to_byte_index(index) + var c = it.char_at(ip) var clen = c.u8char_len var itemlen = item.u8char_len var size_diff = itemlen - clen @@ -583,9 +945,8 @@ class FlatBuffer else if size_diff < 0 then lshift_bytes(ip + clen, -size_diff) end - bytelen += size_diff - bytepos += size_diff - items.set_char_at(ip, item) + _bytelen += size_diff + it.set_char_at(ip, item) end redef fun add(c) @@ -593,17 +954,21 @@ class FlatBuffer if written then reset is_dirty = true var clen = c.u8char_len - enlarge(bytelen + clen) - items.set_char_at(bytelen, c) - bytelen += clen - length += 1 + var bt = _bytelen + enlarge(bt + clen) + _items.set_char_at(bt, c) + _bytelen += clen + _length += 1 end redef fun clear do is_dirty = true - if written then reset - bytelen = 0 - length = 0 + _bytelen = 0 + _length = 0 + if written then + _capacity = 16 + reset + end end redef fun empty do return new Buffer @@ -612,33 +977,40 @@ class FlatBuffer do var c = capacity if cap <= c then return - while c <= cap do c = c * 2 + 2 + if c <= 16 then c = 16 + while c <= cap do c = c * 2 # The COW flag can be set at false here, since # it does a copy of the current `Buffer` written = false - var a = new NativeString(c+1) - if bytelen > 0 then items.copy_to(a, bytelen, 0, 0) - items = a + var bln = _bytelen + var a = new NativeString(c) + if bln > 0 then + var it = _items + if bln > 0 then it.copy_to(a, bln, 0, 0) + end + _items = a capacity = c end redef fun to_s do written = true - if bytelen == 0 then items = new NativeString(1) - return new FlatString.full(items, bytelen, 0, bytelen - 1, length) + var bln = _bytelen + if bln == 0 then _items = new NativeString(1) + return new FlatString.full(_items, bln, 0, _length) end redef fun to_cstring do if is_dirty then - var new_native = new NativeString(bytelen + 1) - new_native[bytelen] = 0u8 - if length > 0 then items.copy_to(new_native, bytelen, 0, 0) + var bln = _bytelen + var new_native = new NativeString(bln + 1) + new_native[bln] = 0u8 + if _length > 0 then _items.copy_to(new_native, bln, 0, 0) real_items = new_native is_dirty = false end - return real_items.as(not null) + return real_items end # Create a new empty string. @@ -646,41 +1018,36 @@ class FlatBuffer # Low-level creation a new buffer with given data. # - # `items` will be used as is, without copy, to store the characters of the buffer. + # `_items` will be used as is, without copy, to store the characters of the buffer. # Aliasing issues is the responsibility of the caller. # - # If `items` is shared, `written` should be set to true after the creation + # If `_items` is shared, `written` should be set to true after the creation # so that a modification will do a copy-on-write. private init with_infos(items: NativeString, capacity, bytelen, length: Int) do - self.items = items + self._items = items self.capacity = capacity - self.bytelen = bytelen - self.length = length + self._bytelen = bytelen + self._length = length end # Create a new string copied from `s`. init from(s: Text) do - items = new NativeString(s.bytelen) - if s isa FlatText then - items = s.items - else - for i in substrings do i.as(FlatString).items.copy_to(items, i.bytelen, 0, 0) - end - bytelen = s.bytelen - length = s.length - capacity = s.bytelen - written = true + _items = new NativeString(s.bytelen) + for i in s.substrings do i._items.copy_to(_items, i._bytelen, first_byte, 0) + _bytelen = s.bytelen + _length = s.length + _capacity = _bytelen end # Create a new empty string with a given capacity. init with_capacity(cap: Int) do assert cap >= 0 - items = new NativeString(cap + 1) + _items = new NativeString(cap) capacity = cap - bytelen = 0 + _bytelen = 0 end redef fun append(s) @@ -688,15 +1055,16 @@ class FlatBuffer if s.is_empty then return is_dirty = true var sl = s.bytelen - enlarge(bytelen + sl) + var nln = _bytelen + sl + enlarge(nln) if s isa FlatText then - s.items.copy_to(items, sl, s.first_byte, bytelen) + s._items.copy_to(_items, sl, s.first_byte, _bytelen) else for i in s.substrings do append i return end - bytelen += sl - length += s.length + _bytelen = nln + _length += s.length end # Copies the content of self in `dest` @@ -713,18 +1081,31 @@ class FlatBuffer do assert count >= 0 if from < 0 then from = 0 - if (from + count) > length then count = length - from - if count != 0 then - var bytefrom = items.char_to_byte_index(from) - var byteto = items.char_to_byte_index(count + from - 1) - byteto += items.char_at(byteto).u8char_len - 1 - var byte_length = byteto - bytefrom + 1 - var r_items = new NativeString(byte_length) - items.copy_to(r_items, byte_length, bytefrom, 0) - return new FlatBuffer.with_infos(r_items, byte_length, byte_length, count) - else - return new Buffer + if (from + count) > _length then count = _length - from + if count <= 0 then return new Buffer + var its = _items + var bytefrom = its.char_to_byte_index(from) + var byteto = its.char_to_byte_index(count + from - 1) + byteto += its.char_at(byteto).u8char_len - 1 + var byte_length = byteto - bytefrom + 1 + var r_items = new NativeString(byte_length) + its.copy_to(r_items, byte_length, bytefrom, 0) + return new FlatBuffer.with_infos(r_items, byte_length, byte_length, count) + end + + redef fun append_substring_impl(s, from, length) do + if length <= 0 then return + if not s isa FlatText then + super + return end + var bytest = s.char_to_byte_index(from) + var bytend = s.char_to_byte_index(from + length - 1) + var btln = bytend - bytest + 1 + enlarge(btln + _bytelen) + s._items.copy_to(_items, btln, bytest, _bytelen) + _bytelen += btln + _length += length end redef fun reverse @@ -732,12 +1113,13 @@ class FlatBuffer written = false var ns = new FlatBuffer.with_capacity(capacity) for i in chars.reverse_iterator do ns.add i - items = ns.items + _items = ns._items end redef fun times(repeats) do - var x = new FlatString.full(items, bytelen, 0, bytelen - 1, length) + var bln = _bytelen + var x = new FlatString.full(_items, bln, 0, _length) for i in [1 .. repeats[ do append(x) end @@ -746,13 +1128,13 @@ class FlatBuffer redef fun upper do if written then reset - for i in [0 .. length[ do self[i] = self[i].to_upper + for i in [0 .. _length[ do self[i] = self[i].to_upper end redef fun lower do if written then reset - for i in [0 .. length[ do self[i] = self[i].to_lower + for i in [0 .. _length[ do self[i] = self[i].to_lower end end @@ -761,14 +1143,11 @@ private class FlatBufferByteReverseIterator var target: FlatBuffer - var target_items: NativeString + var target_items: NativeString is noautoinit var curr_pos: Int - init with_pos(tgt: FlatBuffer, pos: Int) - do - init(tgt, tgt.items, pos) - end + init do target_items = target._items redef fun index do return curr_pos @@ -785,11 +1164,11 @@ private class FlatBufferByteView redef type SELFTYPE: FlatBuffer - redef fun [](index) do return target.items[index] + redef fun [](index) do return target._items[index] - redef fun iterator_from(pos) do return new FlatBufferByteIterator.with_pos(target, pos) + redef fun iterator_from(pos) do return new FlatBufferByteIterator(target, pos) - redef fun reverse_iterator_from(pos) do return new FlatBufferByteReverseIterator.with_pos(target, pos) + redef fun reverse_iterator_from(pos) do return new FlatBufferByteReverseIterator(target, pos) end @@ -798,18 +1177,15 @@ private class FlatBufferByteIterator var target: FlatBuffer - var target_items: NativeString + var target_items: NativeString is noautoinit var curr_pos: Int - init with_pos(tgt: FlatBuffer, pos: Int) - do - init(tgt, tgt.items, pos) - end + init do target_items = target._items redef fun index do return curr_pos - redef fun is_ok do return curr_pos < target.bytelen + redef fun is_ok do return curr_pos < target._bytelen redef fun item do return target_items[curr_pos] @@ -824,11 +1200,6 @@ private class FlatBufferCharReverseIterator var curr_pos: Int - init with_pos(tgt: FlatBuffer, pos: Int) - do - init(tgt, pos) - end - redef fun index do return curr_pos redef fun is_ok do return curr_pos >= 0 @@ -874,13 +1245,13 @@ private class FlatBufferCharView redef fun append(s) do var s_length = s.length - if target.capacity < s.length then enlarge(s_length + target.length) + if target.capacity < s.length then enlarge(s_length + target._length) for i in s do target.add i end - redef fun iterator_from(pos) do return new FlatBufferCharIterator.with_pos(target, pos) + redef fun iterator_from(pos) do return new FlatBufferCharIterator(target, pos) - redef fun reverse_iterator_from(pos) do return new FlatBufferCharReverseIterator.with_pos(target, pos) + redef fun reverse_iterator_from(pos) do return new FlatBufferCharReverseIterator(target, pos) end @@ -889,14 +1260,11 @@ private class FlatBufferCharIterator var target: FlatBuffer - var max: Int + var max: Int is noautoinit var curr_pos: Int - init with_pos(tgt: FlatBuffer, pos: Int) - do - init(tgt, tgt.length - 1, pos) - end + init do max = target._length - 1 redef fun index do return curr_pos @@ -914,72 +1282,157 @@ redef class NativeString return to_s_with_length(cstring_length) end - # Returns `self` as a String of `length`. - redef fun to_s_with_length(length): FlatString + redef fun to_s_with_length(length) do assert length >= 0 - var str = new FlatString.with_infos(self, length, 0, length - 1) - return str + return clean_utf8(length) end redef fun to_s_full(bytelen, unilen) do - return new FlatString.full(self, bytelen, 0, bytelen - 1, unilen) + return new FlatString.full(self, bytelen, 0, unilen) end - # Returns `self` as a new String. - redef fun to_s_with_copy: FlatString + redef fun to_s_unsafe(len) do + if len == null then len = cstring_length + return new FlatString.with_infos(self, len, 0) + end + + redef fun to_s_with_copy do return to_s_with_copy_and_length(cstring_length) + + # Get a `String` from `length` bytes at `self` copied into Nit memory + fun to_s_with_copy_and_length(length: Int): String do - var length = cstring_length + var r = clean_utf8(length) + if r.items != self then return r var new_self = new NativeString(length + 1) copy_to(new_self, length, 0, 0) - var str = new FlatString.with_infos(new_self, length, 0, length - 1) + var str = new FlatString.with_infos(new_self, length, 0) new_self[length] = 0u8 - str.real_items = new_self + str.to_cstring = new_self return str end + # Cleans a NativeString if necessary + fun clean_utf8(len: Int): FlatString do + var replacements: nullable Array[Int] = null + var end_length = len + var pos = 0 + var chr_ln = 0 + var rem = len + while rem > 0 do + while rem >= 4 do + var i = fetch_4_chars(pos) + if i & 0x80808080 != 0 then break + pos += 4 + chr_ln += 4 + rem -= 4 + end + if rem == 0 then break + var b = self[pos] + if b & 0x80u8 == 0x00u8 then + pos += 1 + chr_ln += 1 + rem -= 1 + continue + end + var nxst = length_of_char_at(pos) + var ok_st: Bool + if nxst == 1 then + ok_st = b & 0x80u8 == 0u8 + else if nxst == 2 then + ok_st = b & 0xE0u8 == 0xC0u8 + else if nxst == 3 then + ok_st = b & 0xF0u8 == 0xE0u8 + else + ok_st = b & 0xF8u8 == 0xF0u8 + end + if not ok_st then + if replacements == null then replacements = new Array[Int] + replacements.add pos + end_length += 2 + pos += 1 + rem -= 1 + chr_ln += 1 + continue + end + var ok_c: Bool + var c = char_at(pos) + var cp = c.code_point + if nxst == 1 then + ok_c = cp >= 0 and cp <= 0x7F + else if nxst == 2 then + ok_c = cp >= 0x80 and cp <= 0x7FF + else if nxst == 3 then + ok_c = cp >= 0x800 and cp <= 0xFFFF + ok_c = ok_c and not (cp >= 0xD800 and cp <= 0xDFFF) and cp != 0xFFFE and cp != 0xFFFF + else + ok_c = cp >= 0x10000 and cp <= 0x10FFFF + end + if not ok_c then + if replacements == null then replacements = new Array[Int] + replacements.add pos + end_length += 2 + pos += 1 + chr_ln += 1 + rem -= 1 + continue + end + var clen = c.u8char_len + pos += clen + rem -= clen + chr_ln += 1 + end + var ret = self + if end_length != len then + ret = new NativeString(end_length) + var old_repl = 0 + var off = 0 + var repls = replacements.as(not null) + var r = repls.items.as(not null) + var imax = repls.length + for i in [0 .. imax[ do + var repl_pos = r[i] + var chkln = repl_pos - old_repl + copy_to(ret, chkln, old_repl, off) + off += chkln + ret[off] = 0xEFu8 + ret[off + 1] = 0xBFu8 + ret[off + 2] = 0xBDu8 + old_repl = repl_pos + 1 + off += 3 + end + copy_to(ret, len - old_repl, old_repl, off) + end + return new FlatString.full(ret, end_length, 0, chr_ln) + end + # Sets the next bytes at position `pos` to the value of `c`, encoded in UTF-8 # # Very unsafe, make sure to have room for this char prior to calling this function. private fun set_char_at(pos: Int, c: Char) do + var cp = c.code_point + if cp < 128 then + self[pos] = cp.to_b + return + end var ln = c.u8char_len - native_set_char(pos, c, ln) - end - - private fun native_set_char(pos: Int, c: Char, ln: Int) `{ - char* dst = self + pos; - switch(ln){ - case 1: - dst[0] = c; - break; - case 2: - dst[0] = 0xC0 | ((c & 0x7C0) >> 6); - dst[1] = 0x80 | (c & 0x3F); - break; - case 3: - dst[0] = 0xE0 | ((c & 0xF000) >> 12); - dst[1] = 0x80 | ((c & 0xFC0) >> 6); - dst[2] = 0x80 | (c & 0x3F); - break; - case 4: - dst[0] = 0xF0 | ((c & 0x1C0000) >> 18); - dst[1] = 0x80 | ((c & 0x3F000) >> 12); - dst[2] = 0x80 | ((c & 0xFC0) >> 6); - dst[3] = 0x80 | (c & 0x3F); - break; - } - `} + if ln == 2 then + self[pos] = (0xC0 | ((cp & 0x7C0) >> 6)).to_b + self[pos + 1] = (0x80 | (cp & 0x3F)).to_b + else if ln == 3 then + self[pos] = (0xE0 | ((cp & 0xF000) >> 12)).to_b + self[pos + 1] = (0x80 | ((cp & 0xFC0) >> 6)).to_b + self[pos + 2] = (0x80 | (cp & 0x3F)).to_b + else if ln == 4 then + self[pos] = (0xF0 | ((cp & 0x1C0000) >> 18)).to_b + self[pos + 1] = (0x80 | ((cp & 0x3F000) >> 12)).to_b + self[pos + 2] = (0x80 | ((cp & 0xFC0) >> 6)).to_b + self[pos + 3] = (0x80 | (cp & 0x3F)).to_b + end + end end redef class Int - redef fun to_base(base, signed) - do - var l = digit_count(base) - var s = new FlatBuffer.from(" " * l) - fill_buffer(s, base, signed) - return s.to_s - end - # return displayable int in base 10 and signed # # assert 1.to_s == "1" @@ -993,7 +1446,7 @@ redef class Int var ns = new NativeString(nslen + 1) ns[nslen] = 0u8 native_int_to_s(ns, nslen + 1) - return new FlatString.full(ns, nslen, 0, nslen - 1, nslen) + return new FlatString.full(ns, nslen, 0, nslen) end end @@ -1002,10 +1455,11 @@ redef class Array[E] # Fast implementation redef fun plain_to_s do - var l = length + var l = _length if l == 0 then return "" - if l == 1 then if self[0] == null then return "" else return self[0].to_s - var its = _items + var its = _items.as(not null) + var first = its[0] + if l == 1 then if first == null then return "" else return first.to_s var na = new NativeArray[String](l) var i = 0 var sl = 0 @@ -1029,20 +1483,20 @@ redef class Array[E] while i < mypos do var tmp = na[i] if tmp isa FlatString then - var tpl = tmp.bytelen - tmp.items.copy_to(ns, tpl, tmp.first_byte, off) + var tpl = tmp._bytelen + tmp._items.copy_to(ns, tpl, tmp._first_byte, off) off += tpl else for j in tmp.substrings do var s = j.as(FlatString) - var slen = s.bytelen - s.items.copy_to(ns, slen, s.first_byte, off) + var slen = s._bytelen + s._items.copy_to(ns, slen, s._first_byte, off) off += slen end end i += 1 end - return ns.to_s_with_length(sl) + return new FlatString.with_infos(ns, sl, 0) end end @@ -1066,20 +1520,20 @@ redef class NativeArray[E] while i < mypos do var tmp = na[i] if tmp isa FlatString then - var tpl = tmp.bytelen - tmp.items.copy_to(ns, tpl, tmp.first_byte, off) + var tpl = tmp._bytelen + tmp._items.copy_to(ns, tpl, tmp._first_byte, off) off += tpl else for j in tmp.substrings do var s = j.as(FlatString) - var slen = s.bytelen - s.items.copy_to(ns, slen, s.first_byte, off) + var slen = s._bytelen + s._items.copy_to(ns, slen, s._first_byte, off) off += slen end end i += 1 end - return ns.to_s_with_length(sl) + return new FlatString.with_infos(ns, sl, 0) end end @@ -1096,7 +1550,7 @@ redef class Map[K,V] var e = i.item s.append("{k or else ""}{couple_sep}{e or else ""}") - # Concat other items + # Concat other _items i.next while i.is_ok do s.append(sep)