core :: CString :: clean_utf8
# Cleans a CString if necessary
#
# Scans the first `len` bytes of `self` and returns a `FlatString` in which
# every byte that cannot be part of an acceptable UTF-8 character is replaced
# by the 3-byte sequence `0xEF 0xBF 0xBD` (the UTF-8 encoding of U+FFFD).
#
# A byte is replaced when:
#
# * it has its high bit set but does not match the lead-byte pattern of the
#   length reported by `length_of_char_at`;
# * it announces a character longer than the bytes remaining in `len`
#   (sequence truncated at the end of the buffer);
# * the character decoded by `char_at` falls outside the range allowed for its
#   length (overlong form), is a surrogate (U+D800..U+DFFF), is U+FFFE or
#   U+FFFF, or is above U+10FFFF.
#
# Only the offending byte is replaced; scanning resumes at the following byte.
#
# When nothing has to be replaced, the returned string is built directly on
# `self` (no copy). Otherwise a new `CString` of `len + 2 * replacements`
# bytes is allocated and filled.
fun clean_utf8(len: Int): FlatString do
	# Byte offsets (in `self`) of the bytes to replace, allocated lazily
	var replacements: nullable Array[Int] = null
	# Byte length of the cleaned string: grows by 2 for each replaced byte
	var end_length = len
	var pos = 0
	# Number of characters seen so far (a replaced byte counts for one)
	var chr_ln = 0
	# Bytes left to examine
	var rem = len
	while rem > 0 do
		# Fast path: skip 4 bytes at once while none has its high bit set
		while rem >= 4 do
			var i = fetch_4_chars(pos)
			if i & 0x80808080u32 != 0u32 then break
			pos += 4
			chr_ln += 4
			rem -= 4
		end
		if rem == 0 then break
		var b = self[pos]
		if b & 0x80 == 0x00 then
			# Plain 7-bit byte
			pos += 1
			chr_ln += 1
			rem -= 1
			continue
		end
		# From here on, `b` has its high bit set
		var nxst = length_of_char_at(pos)
		# Step 1: the lead byte must carry the bit pattern of the announced length
		var ok: Bool
		if nxst == 1 then
			# A 1-byte character cannot have its high bit set
			ok = false
		else if nxst == 2 then
			ok = b & 0xE0 == 0xC0
		else if nxst == 3 then
			ok = b & 0xF0 == 0xE0
		else
			ok = b & 0xF8 == 0xF0
		end
		# Step 2: the announced character must fit in the bytes that are left.
		# Without this check a sequence truncated at the end of the buffer would
		# be decoded using bytes outside of the `len` bytes being cleaned, and
		# `rem` could drop below zero.
		if nxst > rem then ok = false
		# Step 3: the decoded code point must be legal for that length
		var clen = 1
		if ok then
			var c = char_at(pos)
			var cp = c.code_point
			if nxst == 2 then
				ok = cp >= 0x80 and cp <= 0x7FF
			else if nxst == 3 then
				ok = cp >= 0x800 and cp <= 0xFFFF
				ok = ok and not (cp >= 0xD800 and cp <= 0xDFFF) and cp != 0xFFFE and cp != 0xFFFF
			else
				ok = cp >= 0x10000 and cp <= 0x10FFFF
			end
			clen = c.u8char_len
		end
		if not ok then
			# Remember the offending byte and resume right after it
			if replacements == null then replacements = new Array[Int]
			replacements.add pos
			end_length += 2
			pos += 1
			rem -= 1
			chr_ln += 1
			continue
		end
		pos += clen
		rem -= clen
		chr_ln += 1
	end
	var ret = self
	if end_length != len then
		# At least one byte was flagged: rebuild the data in a new buffer
		ret = new CString(end_length)
		# Offset in `self` of the first byte not copied yet
		var old_repl = 0
		# Write offset in `ret`
		var off = 0
		var repls = replacements.as(not null)
		var r = repls.items.as(not null)
		var imax = repls.length
		for i in [0 .. imax[ do
			var repl_pos = r[i]
			# Copy the untouched bytes located before the flagged one
			var chkln = repl_pos - old_repl
			copy_to(ret, chkln, old_repl, off)
			off += chkln
			# Write U+FFFD in place of the flagged byte
			ret[off] = 0xEF
			ret[off + 1] = 0xBF
			ret[off + 2] = 0xBD
			old_repl = repl_pos + 1
			off += 3
		end
		# Copy what follows the last flagged byte
		copy_to(ret, len - old_repl, old_repl, off)
	end
	return new FlatString.full(ret, end_length, 0, chr_ln)
end
lib/core/text/flat.nit:1345,2--1437,4