private fun last_byte: Int
do
	# Index of the final byte, derived from the byte length
	return _bytelen - 1
end
# Cache of the latest position (char) explored in the string
- var position: Int = 0
+ private var position: Int = 0
# Cached position (bytes) in the NativeString underlying the String
- var bytepos: Int = first_byte is lateinit
+ private var bytepos: Int = 0
# Index of the character `index` in `_items`
private fun char_to_byte_index(index: Int): Int do
return ns_i
end
- private fun byte_to_char_index(index: Int): Int do
- var ln = _bytelen
- assert index >= 0
- assert index < ln
-
- var pos = _bytepos
- # Find best insertion point
- var delta_begin = index
- var delta_end = (ln - 1) - index
- var delta_cache = (pos - index).abs
- var min = delta_begin
+ # Number of extra bytes needed to escape `self` for C.
+ #
+ # Each `\n`, `\t`, `"`, `'` and `\` byte costs one extra byte, and any
+ # other byte below 32 costs three extra bytes.
+ #
+ # This enables a double-optimization in `escape_to_c`: the result sizes
+ # the output buffer, and if this method returns 0, then `self` does not
+ # need escaping and can be returned as-is.
+ protected fun chars_to_escape_to_c: Int do
var its = _items
-
- if delta_cache < min then min = delta_cache
- if delta_end < min then min = delta_end
-
- var ns_i: Int
- var my_i: Int
-
- if min == delta_begin then
- ns_i = first_byte
- my_i = 0
- else if min == delta_cache then
- ns_i = pos
- my_i = _position
- else
- ns_i = its.find_beginning_of_char_at(last_byte)
- my_i = length - 1
+ var max = last_byte
+ var pos = first_byte
+ var req_esc = 0
+ while pos <= max do
+ var c = its[pos]
+ # These five bytes become a two-byte backslash escape: one extra byte each
+ if c == 0x0Au8 then
+ req_esc += 1
+ else if c == 0x09u8 then
+ req_esc += 1
+ else if c == 0x22u8 then
+ req_esc += 1
+ else if c == 0x27u8 then
+ req_esc += 1
+ else if c == 0x5Cu8 then
+ req_esc += 1
+ else if c < 32u8 then
+ # Any other byte below 32 becomes a four-byte escape: three extra bytes
+ req_esc += 3
+ end
+ pos += 1
end
+ return req_esc
+ end
- my_i = its.byte_to_char_index_cached(index, my_i, ns_i)
-
- _position = my_i
- _bytepos = index
-
- return my_i
+ # Escaped version of `self`, built byte by byte.
+ #
+ # `\t`, `\n`, `"`, `'` and `\` get a two-byte backslash escape; any other
+ # byte below 32 becomes a backslash followed by three octal digits.
+ # All remaining bytes are copied unchanged.
+ #
+ # When `chars_to_escape_to_c` returns 0, `self.to_s` is returned directly.
+ redef fun escape_to_c do
+ var ln_extra = chars_to_escape_to_c
+ if ln_extra == 0 then return self.to_s
+ var its = _items
+ var max = last_byte
+ # Exact byte size of the escaped output
+ var nlen = _bytelen + ln_extra
+ var nns = new NativeString(nlen)
+ # `pos` reads from `its`, `opos` writes into `nns`
+ var pos = first_byte
+ var opos = 0
+ while pos <= max do
+ var c = its[pos]
+ # Special codes:
+ #
+ # Any byte with value < 32 is a control character
+ # All their uses will be replaced by their octal
+ # value in C.
+ #
+ # There are two exceptions however:
+ #
+ # * 0x09 => \t
+ # * 0x0A => \n
+ #
+ # Aside from the code points above, the following bytes
+ # are also escaped with a backslash:
+ #
+ # * 0x22 => \"
+ # * 0x27 => \'
+ # * 0x5C => \\
+ if c == 0x09u8 then
+ nns[opos] = 0x5Cu8
+ nns[opos + 1] = 0x74u8
+ opos += 2
+ else if c == 0x0Au8 then
+ nns[opos] = 0x5Cu8
+ nns[opos + 1] = 0x6Eu8
+ opos += 2
+ else if c == 0x22u8 then
+ nns[opos] = 0x5Cu8
+ nns[opos + 1] = 0x22u8
+ opos += 2
+ else if c == 0x27u8 then
+ nns[opos] = 0x5Cu8
+ nns[opos + 1] = 0x27u8
+ opos += 2
+ else if c == 0x5Cu8 then
+ nns[opos] = 0x5Cu8
+ nns[opos + 1] = 0x5Cu8
+ opos += 2
+ else if c < 32u8 then
+ # Octal escape: since c < 32 the leading octal digit is always `0`,
+ # followed by bits 3-5 and bits 0-2 of `c` as ASCII digits
+ nns[opos] = 0x5Cu8
+ nns[opos + 1] = 0x30u8
+ nns[opos + 2] = ((c & 0x38u8) >> 3) + 0x30u8
+ nns[opos + 3] = (c & 0x07u8) + 0x30u8
+ opos += 4
+ else
+ nns[opos] = c
+ opos += 1
+ end
+ pos += 1
+ end
+ return nns.to_s_with_length(nlen)
end
redef fun [](index)
do
	# Translate the char index into a byte index, then read the char stored there
	var byte_index = char_to_byte_index(index)
	return _items.char_at(byte_index)
end
redef var length is lazy do
if _bytelen == 0 then return 0
- var st = _first_byte
- var its = _items
- var ln = 0
- var lst = _last_byte
- while st <= lst do
- st += its.length_of_char_at(st)
- ln += 1
- end
- return ln
+ # Computed lazily: an empty byte range short-circuits to 0 above,
+ # otherwise the count over `_first_byte .. _last_byte` is delegated
+ # to `utf8_length` on the underlying native string.
+ return _items.utf8_length(_first_byte, _last_byte)
+ end
+
+ # Lazily built copy of the bytes of `self`, followed by a 0 byte.
+ #
+ # The buffer holds `_bytelen + 1` bytes; the value is computed on first
+ # access unless it has been assigned beforehand.
+ redef var to_cstring is lazy do
+ var blen = _bytelen
+ # One extra byte for the terminating 0
+ var new_items = new NativeString(blen + 1)
+ # Presumably copies `blen` bytes starting at `_first_byte` to offset 0
+ # of `new_items` -- confirm against `copy_to`'s signature
+ _items.copy_to(new_items, blen, _first_byte, 0)
+ new_items[blen] = 0u8
+ return new_items
end
redef fun reversed
self._bytelen = bytelen
_first_byte = from
_last_byte = to
+ _bytepos = from
end
# Low-level creation of a new string with all the data.
self._bytelen = bytelen
_first_byte = from
_last_byte = to
- end
-
- redef fun to_cstring do
- if real_items != null then return real_items.as(not null)
- var blen = _bytelen
- var new_items = new NativeString(blen + 1)
- _items.copy_to(new_items, blen, _first_byte, 0)
- new_items[blen] = 0u8
- real_items = new_items
- return new_items
+ _bytepos = from
end
redef fun ==(other)
private var capacity = 0
+ # Real items, used as cache for when to_cstring is called
+ private var real_items: NativeString is noinit
+
redef fun fast_cstring
do
	# Delegates to `_items.fast_cstring` with argument 0
	return _items.fast_cstring(0)
end
redef fun substrings
do
	# Wraps `self` in a fresh `FlatSubstringsIter`
	return new FlatSubstringsIter(self)
end
real_items = new_native
is_dirty = false
end
- return real_items.as(not null)
+ return real_items
end
# Create a new empty string.
redef fun to_s_with_length(length): FlatString
do
assert length >= 0
- var str = new FlatString.with_infos(self, length, 0, length - 1)
- return str
+ # The first `length` bytes go through `clean_utf8` instead of being
+ # wrapped directly; the resulting FlatString is returned.
+ return clean_utf8(length)
end
redef fun to_s_full(bytelen, unilen) do
redef fun to_s_with_copy: FlatString
do
var length = cstring_length
+ # `clean_utf8` allocates a new buffer only when it had to insert
+ # replacements; in that case its result does not share `self` and
+ # is returned without a second copy.
+ var r = clean_utf8(length)
+ if r.items != self then return r
var new_self = new NativeString(length + 1)
copy_to(new_self, length, 0, 0)
var str = new FlatString.with_infos(new_self, length, 0, length - 1)
new_self[length] = 0u8
- str.real_items = new_self
+ # `new_self` ends with a 0 byte, so it is stored as the `to_cstring` value
+ str.to_cstring = new_self
return str
end
+ # Cleans a NativeString if necessary
+ #
+ # Scans the first `len` bytes of `self` and returns a FlatString in which
+ # every position that fails validation is replaced by the three bytes
+ # `EF BF BD` (the UTF-8 encoding of U+FFFD).
+ #
+ # When nothing has to be replaced, the returned string is built directly
+ # over `self` (no copy); otherwise a new NativeString of the final size
+ # is allocated and filled.
+ fun clean_utf8(len: Int): FlatString do
+ # Byte positions to replace, allocated on first use
+ var replacements: nullable Array[Int] = null
+ # Byte length of the result, grown as replacements are recorded
+ var end_length = len
+ var pos = 0
+ # Number of chars in the result (each replacement counts as one)
+ var chr_ln = 0
+ # First pass: validate and record the positions to replace
+ while pos < len do
+ var b = self[pos]
+ var nxst = length_of_char_at(pos)
+ # The lead byte must carry the bit pattern matching the reported length
+ var ok_st: Bool
+ if nxst == 1 then
+ ok_st = b & 0x80u8 == 0u8
+ else if nxst == 2 then
+ ok_st = b & 0xE0u8 == 0xC0u8
+ else if nxst == 3 then
+ ok_st = b & 0xF0u8 == 0xE0u8
+ else
+ ok_st = b & 0xF8u8 == 0xF0u8
+ end
+ if not ok_st then
+ if replacements == null then replacements = new Array[Int]
+ replacements.add pos
+ # One byte is replaced by three: net growth of two bytes
+ end_length += 2
+ pos += 1
+ chr_ln += 1
+ continue
+ end
+ # The code point must lie in the range allowed for its encoded length;
+ # surrogates (0xD800..0xDFFF), 0xFFFE and 0xFFFF are rejected as well
+ var ok_c: Bool
+ # NOTE(review): `char_at` presumably reads up to `nxst` bytes, which may
+ # reach past `len` when a multi-byte lead sits near the end -- confirm
+ var c = char_at(pos)
+ var cp = c.code_point
+ if nxst == 1 then
+ ok_c = cp >= 0 and cp <= 0x7F
+ else if nxst == 2 then
+ ok_c = cp >= 0x80 and cp <= 0x7FF
+ else if nxst == 3 then
+ ok_c = cp >= 0x800 and cp <= 0xFFFF
+ ok_c = ok_c and not (cp >= 0xD800 and cp <= 0xDFFF) and cp != 0xFFFE and cp != 0xFFFF
+ else
+ ok_c = cp >= 0x10000 and cp <= 0x10FFFF
+ end
+ if not ok_c then
+ if replacements == null then replacements = new Array[Int]
+ replacements.add pos
+ end_length += 2
+ pos += 1
+ chr_ln += 1
+ continue
+ end
+ pos += c.u8char_len
+ chr_ln += 1
+ end
+ var ret = self
+ # Second pass, only when at least one replacement was recorded
+ if end_length != len then
+ ret = new NativeString(end_length)
+ # `old_repl` is the next source byte to copy, `off` the write offset in `ret`
+ var old_repl = 0
+ var off = 0
+ # `end_length` only grows alongside an addition to `replacements`,
+ # so the array is not null here
+ var repls = replacements.as(not null)
+ var r = repls.items.as(not null)
+ var imax = repls.length
+ for i in [0 .. imax[ do
+ var repl_pos = r[i]
+ # Copy the chunk preceding the replaced byte
+ var chkln = repl_pos - old_repl
+ copy_to(ret, chkln, old_repl, off)
+ off += chkln
+ # Write the replacement bytes EF BF BD
+ ret[off] = 0xEFu8
+ ret[off + 1] = 0xBFu8
+ ret[off + 2] = 0xBDu8
+ old_repl = repl_pos + 1
+ off += 3
+ end
+ # Copy whatever follows the last replaced byte
+ copy_to(ret, len - old_repl, old_repl, off)
+ end
+ return new FlatString.full(ret, end_length, 0, end_length - 1, chr_ln)
+ end
+
# Sets the next bytes at position `pos` to the value of `c`, encoded in UTF-8
#
# Very unsafe, make sure to have room for this char prior to calling this function.
end
i += 1
end
- return ns.to_s_with_length(sl)
+ return new FlatString.with_infos(ns, sl, 0, sl - 1)
end
end
end
i += 1
end
- return ns.to_s_with_length(sl)
+ return new FlatString.with_infos(ns, sl, 0, sl - 1)
end
end