lib/core/flat: Added clean_utf8 method to NativeString
authorLucas Bajolet <r4pass@hotmail.com>
Fri, 11 Sep 2015 15:34:56 +0000 (11:34 -0400)
committerLucas Bajolet <r4pass@hotmail.com>
Fri, 11 Sep 2015 15:34:56 +0000 (11:34 -0400)
Signed-off-by: Lucas Bajolet <r4pass@hotmail.com>

lib/core/text/flat.nit

index c8b6ecd..7eebc08 100644 (file)
@@ -1005,6 +1005,81 @@ redef class NativeString
                return str
        end
 
+       # Cleans a NativeString if necessary
+       fun clean_utf8(len: Int): FlatString do
+               var replacements: nullable Array[Int] = null
+               var end_length = len
+               var pos = 0
+               var chr_ln = 0
+               while pos < len do
+                       var b = self[pos]
+                       var nxst = length_of_char_at(pos)
+                       var ok_st: Bool
+                       if nxst == 1 then
+                               ok_st = b & 0x80u8 == 0u8
+                       else if nxst == 2 then
+                               ok_st = b & 0xE0u8 == 0xC0u8
+                       else if nxst == 3 then
+                               ok_st = b & 0xF0u8 == 0xE0u8
+                       else
+                               ok_st = b & 0xF8u8 == 0xF0u8
+                       end
+                       if not ok_st then
+                               if replacements == null then replacements = new Array[Int]
+                               replacements.add pos
+                               end_length += 2
+                               pos += 1
+                               chr_ln += 1
+                               continue
+                       end
+                       var ok_c: Bool
+                       var c = char_at(pos)
+                       var cp = c.ascii
+                       if nxst == 1 then
+                               ok_c = cp >= 0 and cp <= 0x7F
+                       else if nxst == 2 then
+                               ok_c = cp >= 0x80 and cp <= 0x7FF
+                       else if nxst == 3 then
+                               ok_c = cp >= 0x800 and cp <= 0xFFFF
+                               ok_c = ok_c and not (cp >= 0xD800 and cp <= 0xDFFF) and cp != 0xFFFE and cp != 0xFFFF
+                       else
+                               ok_c = cp >= 0x10000 and cp <= 0x10FFFF
+                       end
+                       if not ok_c then
+                               if replacements == null then replacements = new Array[Int]
+                               replacements.add pos
+                               end_length += 2
+                               pos += 1
+                               chr_ln += 1
+                               continue
+                       end
+                       pos += c.u8char_len
+                       chr_ln += 1
+               end
+               var ret = self
+               if end_length != len then
+                       ret = new NativeString(end_length)
+                       var old_repl = 0
+                       var off = 0
+                       var repls = replacements.as(not null)
+                       var r = repls.items.as(not null)
+                       var imax = repls.length
+                       for i in [0 .. imax[ do
+                               var repl_pos = r[i]
+                               var chkln = repl_pos - old_repl
+                               copy_to(ret, chkln, old_repl, off)
+                               off += chkln
+                               ret[off] = 0xEFu8
+                               ret[off + 1] = 0xBFu8
+                               ret[off + 2] = 0xBDu8
+                               old_repl = repl_pos + 1
+                               off += 3
+                       end
+                       copy_to(ret, len - old_repl, old_repl, off)
+               end
+               return new FlatString.full(ret, end_length, 0, end_length - 1, chr_ln)
+       end
+
        # Sets the next bytes at position `pos` to the value of `c`, encoded in UTF-8
        #
        # Very unsafe, make sure to have room for this char prior to calling this function.