lib/base64: Revamped `decode_base64` for better performances and introduction of...
authorLucas Bajolet <r4pass@hotmail.com>
Mon, 16 May 2016 15:36:51 +0000 (11:36 -0400)
committerLucas Bajolet <r4pass@hotmail.com>
Wed, 8 Jun 2016 19:51:23 +0000 (15:51 -0400)
Signed-off-by: Lucas Bajolet <r4pass@hotmail.com>

lib/base64.nit

index 8f6565b..29c5bbf 100644 (file)
 # Offers the base 64 encoding and decoding algorithms
 module base64
 
-redef class NativeString
-       # Alphabet used by the base64 algorithm
-       private fun base64_chars : SequenceRead[Byte]
-       do
-               return "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".bytes
+redef class Char
+       # Is `self` a valid Base64 character ?
+       fun is_base64_char: Bool do
+               if code_point >= 127 then return false
+               return ascii.is_base64_char
        end
+end
 
-       # Reversed alphabet for base64
-       private fun inverted_base64_chars : HashMap[Byte, Byte]
-       do
-               var inv_base64_chars = new HashMap[Byte, Byte]
-               var l = base64_chars.length
-               for k in [0 .. l[ do
-                       inv_base64_chars[base64_chars[k]] = k.to_b
+redef class Byte
+       # Is `self` a valid Base64 character ?
+       fun is_base64_char: Bool do
+               if self == b'+' then return true
+               if self == b'/' then return true
+               if self > b'Z' then
+                       if self < b'a' then return false
+                       if self <= b'z' then return true
+                       return false
+               end
+               if self >= b'A' then return true
+               if self <= b'9' and self >= b'0' then return true
+               return false
+       end
+
+       # Returns the `base64` equivalent of `self`
+       #
+       # REQUIRE `self`.`is_base64_char`
+       fun to_base64_char: Byte do
+               if self == b'+' then return 62u8
+               if self == b'/' then return 63u8
+               if self > b'Z' then
+                       if self < b'a' then abort
+                       if self <= b'z' then return self - 71u8
+                       abort
                end
-               return inv_base64_chars
+               if self >= b'A' then return self - 0x41u8
+               if self <= b'9' and self >= b'0' then return self + 4u8
+               abort
+       end
+end
+
+redef class NativeString
+       # Alphabet used by the base64 algorithm
+       private fun base64_chars : Bytes
+       do
+               return b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
        end
 
        # Encodes `self` to base64.
@@ -40,9 +69,8 @@ redef class NativeString
        # By default, uses "=" for padding.
        #
        #     assert "string".encode_base64 == "c3RyaW5n"
-       private fun encode_base64(length: Int, padding: nullable Byte): Bytes do
+       private fun encode_base64(length: Int): Bytes do
                var base64_bytes = once base64_chars
-               if padding == null then padding = '='.ascii
                var steps = length / 3
                var bytes_in_last_step = length % 3
                var result_length = steps * 4
@@ -70,70 +98,102 @@ redef class NativeString
                        result.add base64_bytes[((self[in_off + 1] & 0b0000_1111u8) << 2).to_i]
                end
                var rempad = if bytes_in_last_step > 0 then 3 - bytes_in_last_step else 0
-               for i in [0 .. rempad[ do result.add padding
+               for i in [0 .. rempad[ do result.add b'='
 
                return result
        end
 
        # Decodes `self` from base64
        #
-       #      assert "c3RyaW5n".decode_base64 == "string"
-       #      assert "c3Rya\nW5n".decode_base64 == "string"
+       #      assert "c3RyaW5n".decode_base64.to_s == "string"
+       #      assert "c3Rya\nW5n".decode_base64.to_s == "string"
+       #      assert "c3RyaW5nCg==".decode_base64.to_s == "string\n"
+       #      assert "c3RyaW5nCg".decode_base64.to_s == "string\n"
+       #      assert "c3RyaW5neQo=".decode_base64.to_s == "stringy\n"
+       #      assert "c3RyaW5neQo".decode_base64.to_s == "stringy\n"
        #
-       # REQUIRE: `length % 4 == 0`
-       private fun decode_base64(length: Int, padding: nullable Byte): Bytes do
-               if padding == null then padding = '='.ascii
-               var inv = once inverted_base64_chars
+       private fun decode_base64(length: Int): Bytes do
                if length == 0 then return new Bytes.empty
 
-               # Remove non-base64 chars
-               var bytes = new Bytes.with_capacity(length)
-               for k in [0 .. length[ do
-                       var byte = self[k]
-                       if inv.has_key(byte) or byte == padding then bytes.add(byte)
+               # Avoids constant unboxing
+               var pad = b'='
+
+               var result = new Bytes.with_capacity((length / 4 + 1) * 3)
+
+               var curr = 0
+               var cnt = 0
+               var endpos = -1
+               for i in [0 .. length[ do
+                       var b = self[i]
+                       if b == pad then
+                               endpos = i
+                               break
+                       end
+                       # Ignore whitespaces
+                       if b <= 0x20u8 then continue
+                       if not b.is_base64_char then continue
+                       curr <<= 6
+                       curr += b.to_base64_char.to_i
+                       cnt += 1
+                       if cnt == 4 then
+                               result.add(((curr & 0xFF0000) >> 16).to_b)
+                               result.add(((curr & 0xFF00) >> 8).to_b)
+                               result.add((curr & 0xFF).to_b)
+                               curr = 0
+                               cnt = 0
+                       end
                end
-               length = bytes.length
-
-               var steps = length / 4
-               var result_length = steps * 3
-
-               var epos = length - 1
-               var padding_len = 0
-               while epos >= 0 and bytes[epos] == padding do
-                       epos -= 1
-                       padding_len += 1
+               if endpos != -1 or cnt != 0 then
+                       var pads = 0
+                       for i in [endpos .. length[ do
+                               var b = self[i]
+                               if b <= 0x20u8 then continue
+                               pads += 1
+                       end
+                       if cnt == 2 then
+                               curr >>= 4
+                               result.add(curr.to_b)
+                       else if cnt == 3 then
+                               curr >>= 2
+                               result.add(((curr & 0xFF00) >> 8).to_b)
+                               result.add((curr & 0xFF).to_b)
+                       end
                end
+               return result
+       end
 
-               if padding_len != 0 then steps -= 1
-               if padding_len == 1 then result_length -= 1
-               if padding_len == 2 then result_length -= 2
-
-               var result = new Bytes.with_capacity(result_length + 1)
+       # Is `self` a well-formed Base64 entity ?
+       #
+       # ~~~nit
+       #       assert "Qn03".is_base64
+       #       assert not "#sd=".is_base64
+       # ~~~
+       fun is_base64(length: Int): Bool do return check_base64(length) == null
 
-               for s in [0 .. steps[ do
-                       var c0 = inv[bytes[s * 4]]
-                       var c1 = inv[bytes[s * 4 + 1]]
-                       var c2 = inv[bytes[s * 4 + 2]]
-                       var c3 = inv[bytes[s * 4 + 3]]
-                       result.add (((c0 & 0b0011_1111u8) << 2) | ((c1 & 0b0011_0000u8) >> 4))
-                       result.add (((c1 & 0b0000_1111u8) << 4) | ((c2 & 0b0011_1100u8) >> 2))
-                       result.add (((c2 & 0b0000_0011u8) << 6) | (c3 & 0b0011_1111u8))
+       # Is `self` a well-formed Base64 entity ?
+       #
+       # Will return an Error otherwise with info on which part is erroneous.
+       fun check_base64(length: Int): nullable Error do
+               var rlen = 0
+               var opos = length
+               for i in [0 .. length[ do
+                       if self[i] == b'=' then
+                               opos = i
+                               break
+                       end
+                       if self[i].is_whitespace then continue
+                       if not self[i].is_base64_char then return new Error("Invalid Base64 character at position {i}: {self[i].ascii}")
+                       rlen += 1
+                       if rlen > 4 then rlen -= 4
                end
-
-               var last_start = steps * 4
-               if padding_len == 1 then
-                       var c0 = inv[bytes[last_start]]
-                       var c1 = inv[bytes[last_start + 1]]
-                       var c2 = inv[bytes[last_start + 2]]
-                       result.add (((c0 & 0b0011_1111u8) << 2) | ((c1 & 0b0011_0000u8) >> 4))
-                       result.add (((c1 & 0b0000_1111u8) << 4) | ((c2 & 0b0011_1100u8) >> 2))
-               else if padding_len == 2 then
-                       var c0 = inv[bytes[last_start]]
-                       var c1 = inv[bytes[last_start + 1]]
-                       result.add (((c0 & 0b0011_1111u8) << 2) | ((c1 & 0b0011_0000u8) >> 4))
+               var pad = 0
+               for i in [opos .. length[ do
+                       if self[i].is_whitespace then continue
+                       if self[i] != b'=' then return new Error("Invalid padding character {self[i].ascii} at position {i}")
+                       pad += 1
                end
-
-               return result
+               if rlen + pad != 4 then return new Error("Invalid padding length")
+               return null
        end
 end
 
@@ -142,35 +202,49 @@ redef class Bytes
        # Encodes the receiver string to base64 using a custom padding character.
        #
        # If using the default padding character `=`, see `encode_base64`.
-       fun encode_base64(padding: nullable Byte): Bytes
-       do
-               return items.encode_base64(length, padding)
-       end
+       fun encode_base64: Bytes do return items.encode_base64(length)
 
        # Decodes the receiver string to base64 using a custom padding character.
        #
        # Default padding character `=`
-       fun decode_base64(padding : nullable Byte) : Bytes
-       do
-               return items.decode_base64(length, padding)
-       end
+       fun decode_base64: Bytes do return items.decode_base64(length)
+
+       # Is `self` a well-formed Base64 entity ?
+       fun is_base64: Bool do return items.is_base64(length)
+
+       # Is `self` a well-formed Base64 entity ?
+       #
+       # Will return an Error otherwise with info on which part is erroneous.
+       fun check_base64: nullable Error do return items.check_base64(length)
 end
 
-redef class String
+redef class Text
 
        # Encodes the receiver string to base64 using a custom padding character.
        #
        # If using the default padding character `=`, see `encode_base64`.
-       fun encode_base64(padding: nullable Byte): String
-       do
-               return to_cstring.encode_base64(bytelen, padding).to_s
-       end
+       fun encode_base64: String do return to_cstring.encode_base64(bytelen).to_s
 
        # Decodes the receiver string to base64 using a custom padding character.
        #
        # Default padding character `=`
-       fun decode_base64(padding : nullable Byte) : String
-       do
-               return to_cstring.decode_base64(bytelen, padding).to_s
-       end
+       fun decode_base64: Bytes do return to_cstring.decode_base64(bytelen)
+
+       # Is `self` a well-formed Base64 entity ?
+       fun is_base64: Bool do return to_cstring.is_base64(bytelen)
+
+       # Is `self` a well-formed Base64 entity ?
+       #
+       # Will return an Error otherwise with info on which part is erroneous.
+       fun check_base64: nullable Error do return to_cstring.check_base64(bytelen)
+end
+
+redef class FlatText
+       redef fun encode_base64 do return fast_cstring.encode_base64(bytelen).to_s
+
+       redef fun decode_base64 do return fast_cstring.decode_base64(bytelen)
+
+       redef fun is_base64 do return fast_cstring.is_base64(bytelen)
+
+       redef fun check_base64 do return fast_cstring.check_base64(bytelen)
 end