From: Lucas Bajolet Date: Fri, 11 Sep 2015 15:34:56 +0000 (-0400) Subject: lib/core/flat: Added clean_utf8 method to NativeString X-Git-Tag: v0.7.8~23^2~4 X-Git-Url: http://nitlanguage.org lib/core/flat: Added clean_utf8 method to NativeString Signed-off-by: Lucas Bajolet --- diff --git a/lib/core/text/flat.nit b/lib/core/text/flat.nit index c8b6ecd..7eebc08 100644 --- a/lib/core/text/flat.nit +++ b/lib/core/text/flat.nit @@ -1005,6 +1005,81 @@ redef class NativeString return str end + # Cleans a NativeString if necessary + fun clean_utf8(len: Int): FlatString do + var replacements: nullable Array[Int] = null + var end_length = len + var pos = 0 + var chr_ln = 0 + while pos < len do + var b = self[pos] + var nxst = length_of_char_at(pos) + var ok_st: Bool + if nxst == 1 then + ok_st = b & 0x80u8 == 0u8 + else if nxst == 2 then + ok_st = b & 0xE0u8 == 0xC0u8 + else if nxst == 3 then + ok_st = b & 0xF0u8 == 0xE0u8 + else + ok_st = b & 0xF8u8 == 0xF0u8 + end + if not ok_st then + if replacements == null then replacements = new Array[Int] + replacements.add pos + end_length += 2 + pos += 1 + chr_ln += 1 + continue + end + var ok_c: Bool + var c = char_at(pos) + var cp = c.ascii + if nxst == 1 then + ok_c = cp >= 0 and cp <= 0x7F + else if nxst == 2 then + ok_c = cp >= 0x80 and cp <= 0x7FF + else if nxst == 3 then + ok_c = cp >= 0x800 and cp <= 0xFFFF + ok_c = ok_c and not (cp >= 0xD800 and cp <= 0xDFFF) and cp != 0xFFFE and cp != 0xFFFF + else + ok_c = cp >= 0x10000 and cp <= 0x10FFFF + end + if not ok_c then + if replacements == null then replacements = new Array[Int] + replacements.add pos + end_length += 2 + pos += 1 + chr_ln += 1 + continue + end + pos += c.u8char_len + chr_ln += 1 + end + var ret = self + if end_length != len then + ret = new NativeString(end_length) + var old_repl = 0 + var off = 0 + var repls = replacements.as(not null) + var r = repls.items.as(not null) + var imax = repls.length + for i in [0 .. imax[ do + var repl_pos = r[i] + var chkln = repl_pos - old_repl + copy_to(ret, chkln, old_repl, off) + off += chkln + ret[off] = 0xEFu8 + ret[off + 1] = 0xBFu8 + ret[off + 2] = 0xBDu8 + old_repl = repl_pos + 1 + off += 3 + end + copy_to(ret, len - old_repl, old_repl, off) + end + return new FlatString.full(ret, end_length, 0, end_length - 1, chr_ln) + end + # Sets the next bytes at position `pos` to the value of `c`, encoded in UTF-8 # # Very unsafe, make sure to have room for this char prior to calling this function.