lib/standard/bytes: Added cleaning method on Bytes when converting to String
authorLucas Bajolet <r4pass@hotmail.com>
Wed, 5 Aug 2015 19:35:53 +0000 (15:35 -0400)
committerLucas Bajolet <r4pass@hotmail.com>
Wed, 12 Aug 2015 18:53:48 +0000 (14:53 -0400)
Signed-off-by: Lucas Bajolet <r4pass@hotmail.com>

lib/standard/bytes.nit

index b8cc1fb..0e71a90 100644 (file)
@@ -149,6 +149,72 @@ class Bytes
        end
 
        redef fun iterator do return new BytesIterator.with_buffer(self)
+
+       # Is the byte collection valid UTF-8 ?
+       fun is_utf8: Bool do
+               var charst = once [0x80u8, 0u8, 0xE0u8, 0xC0u8, 0xF0u8, 0xE0u8, 0xF8u8, 0xF0u8]
+               var lobounds = once [0, 0x80, 0x800, 0x10000]
+               var hibounds = once [0x7F, 0x7FF, 0xFFFF, 0x10FFFF]
+               var pos = 0
+               var len = length
+               var mits = items
+               while pos < len do
+                       var nxst = mits.length_of_char_at(pos)
+                       var charst_index = (nxst - 1) * 2
+                       if mits[pos] & charst[charst_index] == charst[charst_index + 1] then
+                               var c = mits.char_at(pos)
+                               var cp = c.ascii
+                               if cp <= hibounds[nxst - 1] and cp >= lobounds[nxst - 1] then
+                                       if cp >= 0xD800 and cp <= 0xDFFF or
+                                          cp == 0xFFFE or cp == 0xFFFF then return false
+                               else
+                                       return false
+                               end
+                       else
+                               return false
+                       end
+                       pos += nxst
+               end
+               return true
+       end
+
+       # Cleans the bytes of `self` to be UTF-8 compliant
+       private fun clean_utf8: Bytes do
+               var charst = once [0x80u8, 0u8, 0xE0u8, 0xC0u8, 0xF0u8, 0xE0u8, 0xF8u8, 0xF0u8]
+               var badchar = once [0xEFu8, 0xBFu8, 0xBDu8]
+               var lobounds = once [0, 0x80, 0x800, 0x10000]
+               var hibounds = once [0x7F, 0x7FF, 0xFFFF, 0x10FFFF]
+               var pos = 0
+               var len = length
+               var ret = new Bytes.with_capacity(len)
+               var mits = items
+               while pos < len do
+                       var nxst = mits.length_of_char_at(pos)
+                       var charst_index = (nxst - 1) * 2
+                       if mits[pos] & charst[charst_index] == charst[charst_index + 1] then
+                               var c = mits.char_at(pos)
+                               var cp = c.ascii
+                               if cp <= hibounds[nxst - 1] and cp >= lobounds[nxst - 1] then
+                                       if cp >= 0xD800 and cp <= 0xDFFF or
+                                          cp == 0xFFFE or cp == 0xFFFF then
+                                               ret.append badchar
+                                               pos += 1
+                                       else
+                                               var pend = pos + nxst
+                                               for i in [pos .. pend[ do ret.add mits[i]
+                                               pos += nxst
+                                       end
+                               else
+                                       ret.append badchar
+                                       pos += 1
+                               end
+                       else
+                               ret.append badchar
+                               pos += 1
+                       end
+               end
+               return ret
+       end
 end
 
 private class BytesIterator