Merge: Basename fix
[nit.git] / lib / core / text / flat.nit
index 5c96bb7..5bc76fc 100644 (file)
@@ -41,10 +41,10 @@ redef class FlatText
        private fun last_byte: Int do return _bytelen - 1
 
        # Cache of the latest position (char) explored in the string
-       var position: Int = 0
+       private var position: Int = 0
 
        # Cached position (bytes) in the NativeString underlying the String
-       var bytepos: Int = first_byte is lateinit
+       private var bytepos: Int = 0
 
        # Index of the character `index` in `_items`
        private fun char_to_byte_index(index: Int): Int do
@@ -85,42 +85,96 @@ redef class FlatText
                return ns_i
        end
 
-       private fun byte_to_char_index(index: Int): Int do
-               var ln = _bytelen
-               assert index >= 0
-               assert index < ln
-
-               var pos = _bytepos
-               # Find best insertion point
-               var delta_begin = index
-               var delta_end = (ln - 1) - index
-               var delta_cache = (pos - index).abs
-               var min = delta_begin
+       # By escaping `self` to C, how many more bytes will be needed ?
+       #
+       # This enables a double-optimization in `escape_to_c` since if this
+       # method returns 0, then `self` does not need escaping and can be
+       # returned as-is
+       protected fun chars_to_escape_to_c: Int do
                var its = _items
-
-               if delta_cache < min then min = delta_cache
-               if delta_end < min then min = delta_end
-
-               var ns_i: Int
-               var my_i: Int
-
-               if min == delta_begin then
-                       ns_i = first_byte
-                       my_i = 0
-               else if min == delta_cache then
-                       ns_i = pos
-                       my_i = _position
-               else
-                       ns_i = its.find_beginning_of_char_at(last_byte)
-                       my_i = length - 1
+               var max = last_byte
+               var pos = first_byte
+               var req_esc = 0
+               while pos <= max do
+                       var c = its[pos]
+                       if c == 0x0Au8 then
+                               req_esc += 1
+                       else if c == 0x09u8 then
+                               req_esc += 1
+                       else if c == 0x22u8 then
+                               req_esc += 1
+                       else if c == 0x27u8 then
+                               req_esc += 1
+                       else if c == 0x5Cu8 then
+                               req_esc += 1
+                       else if c < 32u8 then
+                               req_esc += 3
+                       end
+                       pos += 1
                end
+               return req_esc
+       end
 
-               my_i = its.byte_to_char_index_cached(index, my_i, ns_i)
-
-               _position = my_i
-               _bytepos = index
-
-               return my_i
+       redef fun escape_to_c do
+               var ln_extra = chars_to_escape_to_c
+               if ln_extra == 0 then return self.to_s
+               var its = _items
+               var max = last_byte
+               var nlen = _bytelen + ln_extra
+               var nns = new NativeString(nlen)
+               var pos = first_byte
+               var opos = 0
+               while pos <= max do
+                       var c = its[pos]
+                       # Special codes:
+                       #
+                       # Any byte with value < 32 is a control character
+                       # All their uses will be replaced by their octal
+                       # value in C.
+                       #
+                       # There are two exceptions however:
+                       #
+                       # * 0x09 => \t
+                       # * 0x0A => \n
+                       #
+                       # Aside from the code points above, the following are:
+                       #
+                       # * 0x22 => \"
+                       # * 0x27 => \'
+                       # * 0x5C => \\
+                       if c == 0x09u8 then
+                               nns[opos] = 0x5Cu8
+                               nns[opos + 1] = 0x74u8
+                               opos += 2
+                       else if c == 0x0Au8 then
+                               nns[opos] = 0x5Cu8
+                               nns[opos + 1] = 0x6Eu8
+                               opos += 2
+                       else if c == 0x22u8 then
+                               nns[opos] = 0x5Cu8
+                               nns[opos + 1] = 0x22u8
+                               opos += 2
+                       else if c == 0x27u8 then
+                               nns[opos] = 0x5Cu8
+                               nns[opos + 1] = 0x27u8
+                               opos += 2
+                       else if c == 0x5Cu8 then
+                               nns[opos] = 0x5Cu8
+                               nns[opos + 1] = 0x5Cu8
+                               opos += 2
+                       else if c < 32u8 then
+                               nns[opos] = 0x5Cu8
+                               nns[opos + 1] = 0x30u8
+                               nns[opos + 2] = ((c & 0x38u8) >> 3) + 0x30u8
+                               nns[opos + 3] = (c & 0x07u8) + 0x30u8
+                               opos += 4
+                       else
+                               nns[opos] = c
+                               opos += 1
+                       end
+                       pos += 1
+               end
+               return nns.to_s_with_length(nlen)
        end
 
        redef fun [](index) do return _items.char_at(char_to_byte_index(index))
@@ -143,15 +197,15 @@ class FlatString
 
        redef var length is lazy do
                if _bytelen == 0 then return 0
-               var st = _first_byte
-               var its = _items
-               var ln = 0
-               var lst = _last_byte
-               while st <= lst do
-                       st += its.length_of_char_at(st)
-                       ln += 1
-               end
-               return ln
+               return _items.utf8_length(_first_byte, _last_byte)
+       end
+
+       redef var to_cstring is lazy do
+               var blen = _bytelen
+               var new_items = new NativeString(blen + 1)
+               _items.copy_to(new_items, blen, _first_byte, 0)
+               new_items[blen] = 0u8
+               return new_items
        end
 
        redef fun reversed
@@ -241,6 +295,7 @@ class FlatString
                self._bytelen = bytelen
                _first_byte = from
                _last_byte = to
+               _bytepos = from
        end
 
        # Low-level creation of a new string with all the data.
@@ -254,16 +309,7 @@ class FlatString
                self._bytelen = bytelen
                _first_byte = from
                _last_byte = to
-       end
-
-       redef fun to_cstring do
-               if real_items != null then return real_items.as(not null)
-               var blen = _bytelen
-               var new_items = new NativeString(blen + 1)
-               _items.copy_to(new_items, blen, _first_byte, 0)
-               new_items[blen] = 0u8
-               real_items = new_items
-               return new_items
+               _bytepos = from
        end
 
        redef fun ==(other)
@@ -534,6 +580,9 @@ class FlatBuffer
 
        private var capacity = 0
 
+       # Real items, used as cache for when to_cstring is called
+       private var real_items: NativeString is noinit
+
        redef fun fast_cstring do return _items.fast_cstring(0)
 
        redef fun substrings do return new FlatSubstringsIter(self)
@@ -654,7 +703,7 @@ class FlatBuffer
                        real_items = new_native
                        is_dirty = false
                end
-               return real_items.as(not null)
+               return real_items
        end
 
        # Create a new empty string.
@@ -937,8 +986,7 @@ redef class NativeString
        redef fun to_s_with_length(length): FlatString
        do
                assert length >= 0
-               var str = new FlatString.with_infos(self, length, 0, length - 1)
-               return str
+               return clean_utf8(length)
        end
 
        redef fun to_s_full(bytelen, unilen) do
@@ -949,14 +997,91 @@ redef class NativeString
        redef fun to_s_with_copy: FlatString
        do
                var length = cstring_length
+               var r = clean_utf8(length)
+               if r.items != self then return r
                var new_self = new NativeString(length + 1)
                copy_to(new_self, length, 0, 0)
                var str = new FlatString.with_infos(new_self, length, 0, length - 1)
                new_self[length] = 0u8
-               str.real_items = new_self
+               str.to_cstring = new_self
                return str
        end
 
+       # Cleans a NativeString if necessary
+       fun clean_utf8(len: Int): FlatString do
+               var replacements: nullable Array[Int] = null
+               var end_length = len
+               var pos = 0
+               var chr_ln = 0
+               while pos < len do
+                       var b = self[pos]
+                       var nxst = length_of_char_at(pos)
+                       var ok_st: Bool
+                       if nxst == 1 then
+                               ok_st = b & 0x80u8 == 0u8
+                       else if nxst == 2 then
+                               ok_st = b & 0xE0u8 == 0xC0u8
+                       else if nxst == 3 then
+                               ok_st = b & 0xF0u8 == 0xE0u8
+                       else
+                               ok_st = b & 0xF8u8 == 0xF0u8
+                       end
+                       if not ok_st then
+                               if replacements == null then replacements = new Array[Int]
+                               replacements.add pos
+                               end_length += 2
+                               pos += 1
+                               chr_ln += 1
+                               continue
+                       end
+                       var ok_c: Bool
+                       var c = char_at(pos)
+                       var cp = c.code_point
+                       if nxst == 1 then
+                               ok_c = cp >= 0 and cp <= 0x7F
+                       else if nxst == 2 then
+                               ok_c = cp >= 0x80 and cp <= 0x7FF
+                       else if nxst == 3 then
+                               ok_c = cp >= 0x800 and cp <= 0xFFFF
+                               ok_c = ok_c and not (cp >= 0xD800 and cp <= 0xDFFF) and cp != 0xFFFE and cp != 0xFFFF
+                       else
+                               ok_c = cp >= 0x10000 and cp <= 0x10FFFF
+                       end
+                       if not ok_c then
+                               if replacements == null then replacements = new Array[Int]
+                               replacements.add pos
+                               end_length += 2
+                               pos += 1
+                               chr_ln += 1
+                               continue
+                       end
+                       pos += c.u8char_len
+                       chr_ln += 1
+               end
+               var ret = self
+               if end_length != len then
+                       ret = new NativeString(end_length)
+                       var old_repl = 0
+                       var off = 0
+                       var repls = replacements.as(not null)
+                       var r = repls.items.as(not null)
+                       var imax = repls.length
+                       for i in [0 .. imax[ do
+                               var repl_pos = r[i]
+                               var chkln = repl_pos - old_repl
+                               copy_to(ret, chkln, old_repl, off)
+                               off += chkln
+                               ret[off] = 0xEFu8
+                               ret[off + 1] = 0xBFu8
+                               ret[off + 2] = 0xBDu8
+                               old_repl = repl_pos + 1
+                               off += 3
+                       end
+                       copy_to(ret, len - old_repl, old_repl, off)
+               end
+               return new FlatString.full(ret, end_length, 0, end_length - 1, chr_ln)
+       end
+
        # Sets the next bytes at position `pos` to the value of `c`, encoded in UTF-8
        #
        # Very unsafe, make sure to have room for this char prior to calling this function.
@@ -1061,7 +1186,7 @@ redef class Array[E]
                        end
                        i += 1
                end
-               return ns.to_s_with_length(sl)
+               return new FlatString.with_infos(ns, sl, 0, sl - 1)
        end
 end
 
@@ -1098,7 +1223,7 @@ redef class NativeArray[E]
                        end
                        i += 1
                end
-               return ns.to_s_with_length(sl)
+               return new FlatString.with_infos(ns, sl, 0, sl - 1)
        end
 end