From: Lucas Bajolet
Date: Wed, 5 Aug 2015 19:35:53 +0000 (-0400)
Subject: lib/standard/bytes: Added cleaning method on Bytes when converting to String
X-Git-Tag: v0.7.8~80^2~6
X-Git-Url: http://nitlanguage.org

lib/standard/bytes: Added cleaning method on Bytes when converting to String

Signed-off-by: Lucas Bajolet
---

diff --git a/lib/standard/bytes.nit b/lib/standard/bytes.nit
index b8cc1fb..0e71a90 100644
--- a/lib/standard/bytes.nit
+++ b/lib/standard/bytes.nit
@@ -149,6 +149,72 @@ class Bytes
 	end
 
 	redef fun iterator do return new BytesIterator.with_buffer(self)
+
+	# Is the byte collection valid UTF-8? False on a bad lead byte, a code point outside the range allowed for its length, a surrogate, U+FFFE or U+FFFF.
+	fun is_utf8: Bool do
+		var charst = once [0x80u8, 0u8, 0xE0u8, 0xC0u8, 0xF0u8, 0xE0u8, 0xF8u8, 0xF0u8] # (mask, expected) pairs for the lead byte, one pair per sequence length 1..4
+		var lobounds = once [0, 0x80, 0x800, 0x10000] # smallest code point accepted for a sequence of length 1..4
+		var hibounds = once [0x7F, 0x7FF, 0xFFFF, 0x10FFFF] # largest code point accepted for a sequence of length 1..4
+		var pos = 0
+		var len = length
+		var mits = items
+		while pos < len do
+			var nxst = mits.length_of_char_at(pos) # helper defined elsewhere; assumed to return 1..4, since it indexes the 4-entry tables -- TODO confirm
+			var charst_index = (nxst - 1) * 2 # index of the mask; the expected value is stored right after it
+			if mits[pos] & charst[charst_index] == charst[charst_index + 1] then
+				var c = mits.char_at(pos) # NOTE(review): no visible check that pos + nxst <= len -- confirm the helper copes with a truncated trailing sequence
+				var cp = c.ascii # presumably the code point of `c` -- TODO confirm
+				if cp <= hibounds[nxst - 1] and cp >= lobounds[nxst - 1] then
+					if cp >= 0xD800 and cp <= 0xDFFF or
+					   cp == 0xFFFE or cp == 0xFFFF then return false # surrogate range and the two noncharacters are rejected
+				else
+					return false # code point outside the bounds for this length (presumably an overlong or too-large encoding)
+				end
+			else
+				return false # lead byte does not match the pattern for the reported length
+			end
+			pos += nxst
+		end
+		return true
+	end
+
+	# Cleans the bytes of `self` to be UTF-8 compliant: returns a new `Bytes` where each rejected byte is replaced by the encoding of U+FFFD.
+	private fun clean_utf8: Bytes do
+		var charst = once [0x80u8, 0u8, 0xE0u8, 0xC0u8, 0xF0u8, 0xE0u8, 0xF8u8, 0xF0u8] # (mask, expected) pairs for the lead byte, one pair per sequence length 1..4
+		var badchar = once [0xEFu8, 0xBFu8, 0xBDu8] # UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER
+		var lobounds = once [0, 0x80, 0x800, 0x10000] # smallest code point accepted for a sequence of length 1..4
+		var hibounds = once [0x7F, 0x7FF, 0xFFFF, 0x10FFFF] # largest code point accepted for a sequence of length 1..4
+		var pos = 0
+		var len = length
+		var ret = new Bytes.with_capacity(len) # initial capacity only: each rejected byte adds 3 bytes, so the result can be longer than `len`
+		var mits = items
+		while pos < len do
+			var nxst = mits.length_of_char_at(pos) # helper defined elsewhere; assumed to return 1..4, since it indexes the 4-entry tables -- TODO confirm
+			var charst_index = (nxst - 1) * 2 # index of the mask; the expected value is stored right after it
+			if mits[pos] & charst[charst_index] == charst[charst_index + 1] then
+				var c = mits.char_at(pos) # NOTE(review): no visible check that pos + nxst <= len -- confirm the helper copes with a truncated trailing sequence
+				var cp = c.ascii # presumably the code point of `c` -- TODO confirm
+				if cp <= hibounds[nxst - 1] and cp >= lobounds[nxst - 1] then
+					if cp >= 0xD800 and cp <= 0xDFFF or
+					   cp == 0xFFFE or cp == 0xFFFF then
+						ret.append badchar # surrogate or noncharacter: emit U+FFFD instead
+						pos += 1 # skip a single byte only; the following bytes are examined again on the next iterations
+					else
+						var pend = pos + nxst
+						for i in [pos .. pend[ do ret.add mits[i] # accepted sequence: copy its `nxst` bytes unchanged
+						pos += nxst
+					end
+				else
+					ret.append badchar # code point outside the bounds for this length
+					pos += 1
+				end
+			else
+				ret.append badchar # lead byte does not match the pattern for the reported length
+				pos += 1
+			end
+		end
+		return ret
+	end
 end
 
 private class BytesIterator