projects: update some short descriptions
[nit.git] / lib / standard / bytes.nit
index 2a3526f..59c4c5f 100644 (file)
@@ -46,6 +46,7 @@ class Bytes
                init(ns, 0, 0)
        end
 
+       # Init a `Bytes` with capacity `cap`
        init with_capacity(cap: Int) do
                var ns = new NativeString(cap)
                init(ns, 0, cap)
@@ -96,6 +97,16 @@ class Bytes
                end
        end
 
+       # Removes the last byte of `self` and returns it
+       #
+       # The collection must hold at least one byte (checked by an assertion).
+       #
+       #     var b = new Bytes.empty
+       #     b.append([0x41u8, 0x41u8, 0x18u8])
+       #     b.pop
+       #     assert b.to_s == "AA"
+       redef fun pop do
+               var last = length - 1
+               assert last >= 0
+               length = last
+               return items[last]
+       end
+
        # Resets `length` to 0; the content of `items` is not touched here
        redef fun clear do length = 0
 
        # Regenerates the buffer, necessary when it was persisted
@@ -134,10 +145,81 @@ class Bytes
 
        # Builds a `FlatString` over the bytes of `self`
        #
        # When `is_utf8` is false, the string is built over the copy returned
        # by `clean_utf8` instead of over the bytes of `self`.
        redef fun to_s do
                persisted = true
-               return new FlatString.with_infos(items, length, 0, length -1)
+               var b = self
+               if not is_utf8 then
+                       # Invalid content: use the cleaned copy for the string
+                       b = clean_utf8
+                       # NOTE(review): presumably reset because the returned string
+                       # then uses the copy's `items`, not those of `self` - confirm
+                       persisted = false
+               end
+               return new FlatString.with_infos(b.items, b.length, 0, b.length -1)
        end
 
        # Returns a new `BytesIterator` built with `self` as its buffer
        redef fun iterator do return new BytesIterator.with_buffer(self)
+
+       # Is the byte collection valid UTF-8 ?
+       #
+       # Walks the first `length` bytes of `items` one character at a time and
+       # returns `false` as soon as a character:
+       #
+       # * starts with a byte that does not match the leading-byte pattern
+       #   of its announced length,
+       # * is truncated by the end of the collection,
+       # * decodes to a code point outside the range allowed for its length,
+       # * is a surrogate (0xD800..0xDFFF), 0xFFFE or 0xFFFF.
+       #
+       # Returns `true` otherwise (including for an empty collection).
+       fun is_utf8: Bool do
+               # (mask, expected value) pairs for the leading byte of a character
+               # of 1, 2, 3 and 4 bytes
+               var charst = once [0x80u8, 0u8, 0xE0u8, 0xC0u8, 0xF0u8, 0xE0u8, 0xF8u8, 0xF0u8]
+               # Lowest and highest code point accepted for each character length
+               var lobounds = once [0, 0x80, 0x800, 0x10000]
+               var hibounds = once [0x7F, 0x7FF, 0xFFFF, 0x10FFFF]
+               var pos = 0
+               var len = length
+               var mits = items
+               while pos < len do
+                       var nxst = mits.length_of_char_at(pos)
+                       # A character running past `length` is truncated: decoding it
+                       # would read bytes that are not part of the collection
+                       if pos + nxst > len then return false
+                       var charst_index = (nxst - 1) * 2
+                       if mits[pos] & charst[charst_index] != charst[charst_index + 1] then return false
+                       var cp = mits.char_at(pos).ascii
+                       if cp > hibounds[nxst - 1] or cp < lobounds[nxst - 1] then return false
+                       if (cp >= 0xD800 and cp <= 0xDFFF) or cp == 0xFFFE or cp == 0xFFFF then return false
+                       pos += nxst
+               end
+               return true
+       end
+
+       # Cleans the bytes of `self` to be UTF-8 compliant
+       #
+       # Returns a new `Bytes`; `self` is not modified. Characters accepted by
+       # the same rules as `is_utf8` are copied as is. At any position where a
+       # character is rejected (bad leading byte, character truncated by the
+       # end of the collection, code point out of range for its length,
+       # surrogate, 0xFFFE or 0xFFFF), the bytes 0xEF 0xBF 0xBD (UTF-8 encoding
+       # of U+FFFD) are emitted and the scan resumes one byte further.
+       private fun clean_utf8: Bytes do
+               # (mask, expected value) pairs for the leading byte of a character
+               # of 1, 2, 3 and 4 bytes
+               var charst = once [0x80u8, 0u8, 0xE0u8, 0xC0u8, 0xF0u8, 0xE0u8, 0xF8u8, 0xF0u8]
+               var badchar = once [0xEFu8, 0xBFu8, 0xBDu8]
+               # Lowest and highest code point accepted for each character length
+               var lobounds = once [0, 0x80, 0x800, 0x10000]
+               var hibounds = once [0x7F, 0x7FF, 0xFFFF, 0x10FFFF]
+               var pos = 0
+               var len = length
+               var ret = new Bytes.with_capacity(len)
+               var mits = items
+               while pos < len do
+                       var nxst = mits.length_of_char_at(pos)
+                       var charst_index = (nxst - 1) * 2
+                       var valid = false
+                       # `pos + nxst <= len` rejects a character truncated by the end of
+                       # the collection, so no byte past `length` is decoded or copied
+                       if pos + nxst <= len and mits[pos] & charst[charst_index] == charst[charst_index + 1] then
+                               var cp = mits.char_at(pos).ascii
+                               if cp <= hibounds[nxst - 1] and cp >= lobounds[nxst - 1] then
+                                       valid = not ((cp >= 0xD800 and cp <= 0xDFFF) or cp == 0xFFFE or cp == 0xFFFF)
+                               end
+                       end
+                       if valid then
+                               var pend = pos + nxst
+                               for i in [pos .. pend[ do ret.add mits[i]
+                               pos += nxst
+                       else
+                               # Replace the offending byte only, then rescan from the next one
+                               ret.append badchar
+                               pos += 1
+                       end
+               end
+               return ret
+       end
 end
 
 private class BytesIterator
@@ -158,7 +240,37 @@ private class BytesIterator
        redef fun item do return tgt[index]
 end
 
+redef class Text
+       # Copies the bytes of `self` into a new, mutable `Bytes`
+       #
+       # The result is created with a capacity of `bytelen` and filled
+       # through `append_to_bytes`.
+       #
+       # ~~~nit
+       # assert "String".to_bytes isa Bytes
+       # assert "String".to_bytes == [83u8, 116u8, 114u8, 105u8, 110u8, 103u8]
+       # ~~~
+       fun to_bytes: Bytes do
+               var res = new Bytes.with_capacity(bytelen)
+               append_to_bytes(res)
+               return res
+       end
+
+       # Appends the bytes of each of the `substrings` of `self` to `b`
+       fun append_to_bytes(b: Bytes) do
+               for s in substrings do
+                       # Start at `first_byte` for a `FlatString`, at 0 otherwise
+                       var from = 0
+                       if s isa FlatString then from = s.first_byte
+                       b.append_ns_from(s.items, s.bytelen, from)
+               end
+       end
+end
+
+redef class FlatText
+       # Appends the bytes of `self` to `b` in a single `append_ns_from` call
+       #
+       # `items` and `bytelen` are forwarded as is; the starting offset is
+       # `first_byte` when `self` is a `FlatString` and 0 otherwise.
+       redef fun append_to_bytes(b) do
+               var from = 0
+               if self isa FlatString then from = first_byte
+               b.append_ns_from(items, bytelen, from)
+       end
+end
+
 redef class NativeString
+       # Creates a new `Bytes` object from `self` with `strlen` as length
        fun to_bytes: Bytes do
                var len = cstring_length
                return new Bytes(self, len, len)