For quite some time now, we have had a cleaning function for Bytes which ensured that data coming from the outside was clean and could be safely transformed to a String.
This is now generalized to any NativeString, and the cleaning function is called each time a NativeString is converted with `to_s`.
At the same time, `clean_utf8` now performs better (for `Files::read_all`, the Ir count per call is roughly 40% lower than before), which limits the impact of the new strategy.
Furthermore, the string produced by `NativeString::clean_utf8` has its length already calculated, which saves time on later operations on the string.
It also limits the number of calls by avoiding allocations when they are not necessary (i.e. when the string is already clean, which should be the case far more often than not).
As for performance:
Valgrind `./bin/nitc src/nitc.nit`:
Before: 14.040 GIr
After: 13.859 GIr
Time, best of 10 for `./bin/nitc src/nitc.nit -o bin/nitc`:
Before: 0m4.989s
After: 0m4.933s
Time, best of 10 for `./bin/nitc --semi-global src/nitc.nit -o bin/nitc`:
Before: 0m4.696s
After: 0m4.691s
Pretty much equivalent in real time, and a bit better under Valgrind; not bad considering every String is now cleaner than ever!
Pull-Request: #1705
Reviewed-by: Jean Privat <jean@pryen.org>
Reviewed-by: Alexis Laferrière <alexis.laf@xymus.net>
module opportunity_controller
import nitcorn
-import sha1
import templates
import opportunity_model
redef fun commit(db) do
if id == "" then
var time = get_time
- var tmpid = (name + date + place + time.to_s).sha1_to_s
+ var tmpid = (name + date + place + time.to_s).sha1.hexdigest
if not db.execute("INSERT INTO meetups (id, name, date, place, answer_mode) VALUES({tmpid.to_sql_string}, {name.html_escape.to_sql_string}, {date.html_escape.to_sql_string}, {place.html_escape.to_sql_string}, {answer_mode});") then
print "Error recording entry Meetup {self}"
print db.error or else "Null error"
import sha1
-print "Rosetta Code".sha1_to_s
+print "Rosetta Code".sha1.hexdigest
# Offers the base 64 encoding and decoding algorithms
module base64
-redef class String
-
+redef class NativeString
# Alphabet used by the base64 algorithm
- private fun base64_chars : String
+ private fun base64_chars : SequenceRead[Byte]
do
- return "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
+ return "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".bytes
end
+
+ # Reversed alphabet for base64
private fun inverted_base64_chars : HashMap[Byte, Byte]
do
var inv_base64_chars = new HashMap[Byte, Byte]
- for k in [0..base64_chars.bytelen[ do
- inv_base64_chars[base64_chars.bytes[k]] = k.to_b
+ var l = base64_chars.length
+ for k in [0 .. l[ do
+ inv_base64_chars[base64_chars[k]] = k.to_b
end
return inv_base64_chars
end
- # Encodes the receiver string to base64.
+ # Encodes `self` to base64.
+ #
# By default, uses "=" for padding.
- fun encode_base64 : String do return encode_base64_custom_padding('='.ascii.to_b)
-
- # Encodes the receiver string to base64 using a custom padding character.
#
- # If using the default padding character `=`, see `encode_base64`.
- fun encode_base64_custom_padding(padding : Byte) : String
- do
- var base64_bytes = once base64_chars.bytes
- var length = bytelen
-
+ # assert "string".encode_base64 == "c3RyaW5n"
+ private fun encode_base64(length: Int, padding: nullable Byte): Bytes do
+ var base64_bytes = once base64_chars
+ if padding == null then padding = '='.ascii.to_b
var steps = length / 3
var bytes_in_last_step = length % 3
var result_length = steps * 4
if bytes_in_last_step > 0 then result_length += 4
- var result = new NativeString(result_length + 1)
- var bytes = self.bytes
- result[result_length] = 0u8
-
- var mask_6bit = 0b0011_1111
+ var result = new Bytes.with_capacity(result_length)
+ var in_off = 0
for s in [0 .. steps[ do
- var e = 0
- for ss in [0 .. 3[ do
- e += bytes[s * 3 + ss].to_i << ((2 - ss) * 8)
- end
- for ss in [0..4[ do
- result[s * 4 + 3 - ss] = base64_bytes[(e >> (ss * 6)) & mask_6bit]
- end
+ var ind = ((self[in_off] & 0b1111_1100u8) >> 2).to_i
+ result.add base64_bytes[ind]
+ ind = ((self[in_off] & 0b0000_0011u8) << 4).to_i | ((self[in_off + 1] & 0b1111_0000u8) >> 4).to_i
+ result.add base64_bytes[ind]
+ ind = ((self[in_off + 1] & 0b0000_1111u8) << 2).to_i | ((self[in_off + 2] & 0b1100_0000u8) >> 6).to_i
+ result.add base64_bytes[ind]
+ ind = (self[in_off + 2] & 0b0011_1111u8).to_i
+ result.add base64_bytes[ind]
+ in_off += 3
end
-
- var out_off = result_length - 4
- var in_off = length - bytes_in_last_step
if bytes_in_last_step == 1 then
- result[out_off] = base64_bytes[((bytes[in_off] & 0b1111_1100u8) >> 2).to_i]
- result[out_off + 1] = base64_bytes[((bytes[in_off] & 0b0000_0011u8) << 4).to_i]
- out_off += 2
+ result.add base64_bytes[((self[in_off] & 0b1111_1100u8) >> 2).to_i]
+ result.add base64_bytes[((self[in_off] & 0b0000_0011u8) << 4).to_i]
else if bytes_in_last_step == 2 then
- result[out_off] = base64_bytes[((bytes[in_off] & 0b1111_1100u8) >> 2).to_i]
- result[out_off + 1] = base64_bytes[(((bytes[in_off] & 0b0000_0011u8) << 4) | ((bytes[in_off + 1] & 0b1111_0000u8) >> 4)).to_i]
- result[out_off + 2] = base64_bytes[((bytes[in_off + 1] & 0b0000_1111u8) << 2).to_i]
- out_off += 3
- end
- if bytes_in_last_step > 0 then
- for i in [out_off .. result_length[ do result[i] = padding
+ result.add base64_bytes[((self[in_off] & 0b1111_1100u8) >> 2).to_i]
+ result.add base64_bytes[(((self[in_off] & 0b0000_0011u8) << 4) | ((self[in_off + 1] & 0b1111_0000u8) >> 4)).to_i]
+ result.add base64_bytes[((self[in_off + 1] & 0b0000_1111u8) << 2).to_i]
end
+ var rempad = if bytes_in_last_step > 0 then 3 - bytes_in_last_step else 0
+ for i in [0 .. rempad[ do result.add padding
- return result.to_s_with_length(result_length)
+ return result
end
- # Decodes the receiver string from base64.
- # By default, uses "=" for padding.
- fun decode_base64 : String do return decode_base64_custom_padding('='.ascii.to_b)
-
- # Decodes the receiver string to base64 using a custom padding character.
+ # Decodes `self` from base64
#
- # If using the default padding character `=`, see `decode_base64`.
- fun decode_base64_custom_padding(padding : Byte) : String
- do
+ # assert "c3RyaW5n".decode_base64 == "string"
+ #
+ # REQUIRE: `length % 4 == 0`
+ private fun decode_base64(length: Int, padding: nullable Byte): Bytes do
+ if padding == null then padding = '='.ascii.to_b
var inv = once inverted_base64_chars
- var length = bytelen
- if length == 0 then return ""
+ if length == 0 then return new Bytes.empty
assert length % 4 == 0 else print "base64::decode_base64 only supports strings of length multiple of 4"
- var bytes = self.bytes
+ var bytes = self
var steps = length / 4
var result_length = steps * 3
if padding_len == 1 then result_length -= 1
if padding_len == 2 then result_length -= 2
- var result = new NativeString(result_length + 1)
- result[result_length] = 0u8
+ var result = new Bytes.with_capacity(result_length + 1)
for s in [0 .. steps[ do
var c0 = inv[bytes[s * 4]]
var c1 = inv[bytes[s * 4 + 1]]
var c2 = inv[bytes[s * 4 + 2]]
var c3 = inv[bytes[s * 4 + 3]]
- result[s * 3] = ((c0 & 0b0011_1111u8) << 2) | ((c1 & 0b0011_0000u8) >> 4)
- result[s * 3 + 1] = ((c1 & 0b0000_1111u8) << 4) | ((c2 & 0b0011_1100u8) >> 2)
- result[s * 3 + 2] = ((c2 & 0b0000_0011u8) << 6) | (c3 & 0b0011_1111u8)
+ result.add (((c0 & 0b0011_1111u8) << 2) | ((c1 & 0b0011_0000u8) >> 4))
+ result.add (((c1 & 0b0000_1111u8) << 4) | ((c2 & 0b0011_1100u8) >> 2))
+ result.add (((c2 & 0b0000_0011u8) << 6) | (c3 & 0b0011_1111u8))
end
var last_start = steps * 4
var c0 = inv[bytes[last_start]]
var c1 = inv[bytes[last_start + 1]]
var c2 = inv[bytes[last_start + 2]]
- result[result_length - 2] = ((c0 & 0b0011_1111u8) << 2) | ((c1 & 0b0011_0000u8) >> 4)
- result[result_length - 1] = ((c1 & 0b0000_1111u8) << 4) | ((c2 & 0b0011_1100u8) >> 2)
+ result.add (((c0 & 0b0011_1111u8) << 2) | ((c1 & 0b0011_0000u8) >> 4))
+ result.add (((c1 & 0b0000_1111u8) << 4) | ((c2 & 0b0011_1100u8) >> 2))
else if padding_len == 2 then
var c0 = inv[bytes[last_start]]
var c1 = inv[bytes[last_start + 1]]
- result[result_length - 1] = ((c0 & 0b0011_1111u8) << 2) | ((c1 & 0b0011_0000u8) >> 4)
+ result.add (((c0 & 0b0011_1111u8) << 2) | ((c1 & 0b0011_0000u8) >> 4))
end
- return result.to_s_with_length(result_length)
+ return result
+ end
+end
+
+redef class Bytes
+
+ # Encodes the receiver to base64, optionally using a custom padding character.
+ #
+ # Default padding character `=`
+ fun encode_base64(padding: nullable Byte): Bytes
+ do
+ return items.encode_base64(length, padding)
+ end
+
+ # Decodes the receiver from base64, optionally using a custom padding character.
+ #
+ # Default padding character `=`
+ fun decode_base64(padding : nullable Byte) : Bytes
+ do
+ return items.decode_base64(length, padding)
+ end
+end
+
+redef class String
+
+ # Encodes the receiver string to base64, optionally using a custom padding character.
+ #
+ # Default padding character `=`
+ fun encode_base64(padding: nullable Byte): String
+ do
+ return to_cstring.encode_base64(bytelen, padding).to_s
+ end
+
+ # Decodes the receiver string from base64, optionally using a custom padding character.
+ #
+ # Default padding character `=`
+ fun decode_base64(padding : nullable Byte) : String
+ do
+ return to_cstring.decode_base64(bytelen, padding).to_s
end
end
import collection::array
intrude import text::flat
+redef class Byte
+ # Writes `self` as two uppercase hexadecimal digits into `ns` at positions `pos` and `pos + 1`
+ private fun add_digest_at(ns: NativeString, pos: Int) do
+ var tmp = (0xF0u8 & self) >> 4
+ ns[pos] = if tmp >= 0x0Au8 then tmp + 0x37u8 else tmp + 0x30u8
+ tmp = 0x0Fu8 & self
+ ns[pos + 1] = if tmp >= 0x0Au8 then tmp + 0x37u8 else tmp + 0x30u8
+ end
+end
+
# A buffer containing Byte-manipulation facilities
#
# Uses Copy-On-Write when persisted
super AbstractArray[Byte]
# A NativeString being a char*, it can be used as underlying representation here.
- private var items: NativeString
+ var items: NativeString
# Number of bytes in the array
redef var length
return items[i]
end
+ # Returns self as a hexadecimal digest
+ fun hexdigest: String do
+ var elen = length * 2
+ var ns = new NativeString(elen)
+ var i = 0
+ var oi = 0
+ while i < length do
+ self[i].add_digest_at(ns, oi)
+ i += 1
+ oi += 2
+ end
+ return new FlatString.full(ns, elen, 0, elen - 1, elen)
+ end
+
# var b = new Bytes.with_capacity(1)
# b[0] = 101u8
# assert b.to_s == "e"
redef fun to_s do
persisted = true
var b = self
- if not is_utf8 then
- b = clean_utf8
- persisted = false
- end
- return new FlatString.with_infos(b.items, b.length, 0, b.length -1)
+ var r = b.items.to_s_with_length(length)
+ if r != items then persisted = false
+ return r
end
redef fun iterator do return new BytesIterator.with_buffer(self)
- # Is the byte collection valid UTF-8 ?
- fun is_utf8: Bool do
- var charst = once [0x80u8, 0u8, 0xE0u8, 0xC0u8, 0xF0u8, 0xE0u8, 0xF8u8, 0xF0u8]
- var lobounds = once [0, 0x80, 0x800, 0x10000]
- var hibounds = once [0x7F, 0x7FF, 0xFFFF, 0x10FFFF]
- var pos = 0
- var len = length
- var mits = items
- while pos < len do
- var nxst = mits.length_of_char_at(pos)
- var charst_index = (nxst - 1) * 2
- if mits[pos] & charst[charst_index] == charst[charst_index + 1] then
- var c = mits.char_at(pos)
- var cp = c.ascii
- if cp <= hibounds[nxst - 1] and cp >= lobounds[nxst - 1] then
- if cp >= 0xD800 and cp <= 0xDFFF or
- cp == 0xFFFE or cp == 0xFFFF then return false
- else
- return false
- end
- else
- return false
- end
- pos += nxst
- end
- return true
- end
-
- # Cleans the bytes of `self` to be UTF-8 compliant
- private fun clean_utf8: Bytes do
- var charst = once [0x80u8, 0u8, 0xE0u8, 0xC0u8, 0xF0u8, 0xE0u8, 0xF8u8, 0xF0u8]
- var badchar = once [0xEFu8, 0xBFu8, 0xBDu8]
- var lobounds = once [0, 0x80, 0x800, 0x10000]
- var hibounds = once [0x7F, 0x7FF, 0xFFFF, 0x10FFFF]
- var pos = 0
- var len = length
- var ret = new Bytes.with_capacity(len)
- var mits = items
- while pos < len do
- var nxst = mits.length_of_char_at(pos)
- var charst_index = (nxst - 1) * 2
- if mits[pos] & charst[charst_index] == charst[charst_index + 1] then
- var c = mits.char_at(pos)
- var cp = c.ascii
- if cp <= hibounds[nxst - 1] and cp >= lobounds[nxst - 1] then
- if cp >= 0xD800 and cp <= 0xDFFF or
- cp == 0xFFFE or cp == 0xFFFF then
- ret.append badchar
- pos += 1
- else
- var pend = pos + nxst
- for i in [pos .. pend[ do ret.add mits[i]
- pos += nxst
- end
- else
- ret.append badchar
- pos += 1
- end
- else
- ret.append badchar
- pos += 1
- end
- end
- return ret
- end
end
private class BytesIterator
# ~~~
fun read_all: String do
var s = read_all_bytes
- if not s.is_utf8 then s = s.clean_utf8
var slen = s.length
if slen == 0 then return ""
var rets = ""
var pos = 0
- var sits = s.items
+ var str = s.items.clean_utf8(slen)
+ slen = str.bytelen
+ var sits = str.items
var remsp = slen
while pos < slen do
# The 129 size was decided more or less arbitrarily
redef fun to_s_with_length(length): FlatString
do
assert length >= 0
- var str = new FlatString.with_infos(self, length, 0, length - 1)
- return str
+ return clean_utf8(length)
end
redef fun to_s_full(bytelen, unilen) do
redef fun to_s_with_copy: FlatString
do
var length = cstring_length
+ var r = clean_utf8(length)
+ if r.items != self then return r
var new_self = new NativeString(length + 1)
copy_to(new_self, length, 0, 0)
var str = new FlatString.with_infos(new_self, length, 0, length - 1)
return str
end
+ # Cleans a NativeString if necessary
+ fun clean_utf8(len: Int): FlatString do
+ var replacements: nullable Array[Int] = null
+ var end_length = len
+ var pos = 0
+ var chr_ln = 0
+ while pos < len do
+ var b = self[pos]
+ var nxst = length_of_char_at(pos)
+ var ok_st: Bool
+ if nxst == 1 then
+ ok_st = b & 0x80u8 == 0u8
+ else if nxst == 2 then
+ ok_st = b & 0xE0u8 == 0xC0u8
+ else if nxst == 3 then
+ ok_st = b & 0xF0u8 == 0xE0u8
+ else
+ ok_st = b & 0xF8u8 == 0xF0u8
+ end
+ if not ok_st then
+ if replacements == null then replacements = new Array[Int]
+ replacements.add pos
+ end_length += 2
+ pos += 1
+ chr_ln += 1
+ continue
+ end
+ var ok_c: Bool
+ var c = char_at(pos)
+ var cp = c.ascii
+ if nxst == 1 then
+ ok_c = cp >= 0 and cp <= 0x7F
+ else if nxst == 2 then
+ ok_c = cp >= 0x80 and cp <= 0x7FF
+ else if nxst == 3 then
+ ok_c = cp >= 0x800 and cp <= 0xFFFF
+ ok_c = ok_c and not (cp >= 0xD800 and cp <= 0xDFFF) and cp != 0xFFFE and cp != 0xFFFF
+ else
+ ok_c = cp >= 0x10000 and cp <= 0x10FFFF
+ end
+ if not ok_c then
+ if replacements == null then replacements = new Array[Int]
+ replacements.add pos
+ end_length += 2
+ pos += 1
+ chr_ln += 1
+ continue
+ end
+ pos += c.u8char_len
+ chr_ln += 1
+ end
+ var ret = self
+ if end_length != len then
+ ret = new NativeString(end_length)
+ var old_repl = 0
+ var off = 0
+ var repls = replacements.as(not null)
+ var r = repls.items.as(not null)
+ var imax = repls.length
+ for i in [0 .. imax[ do
+ var repl_pos = r[i]
+ var chkln = repl_pos - old_repl
+ copy_to(ret, chkln, old_repl, off)
+ off += chkln
+ ret[off] = 0xEFu8
+ ret[off + 1] = 0xBFu8
+ ret[off + 2] = 0xBDu8
+ old_repl = repl_pos + 1
+ off += 3
+ end
+ copy_to(ret, len - old_repl, old_repl, off)
+ end
+ return new FlatString.full(ret, end_length, 0, end_length - 1, chr_ln)
+ end
+
# Sets the next bytes at position `pos` to the value of `c`, encoded in UTF-8
#
# Very unsafe, make sure to have room for this char prior to calling this function.
end
i += 1
end
- return ns.to_s_with_length(sl)
+ return new FlatString.with_infos(ns, sl, 0, sl - 1)
end
end
end
i += 1
end
- return ns.to_s_with_length(sl)
+ return new FlatString.with_infos(ns, sl, 0, sl - 1)
end
end
# This file is part of NIT (http://www.nitlanguage.org).
#
-# Copyright 2014 Lucas Bajolet <r4pass@hotmail.com>
-#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
}
`}
-redef class String
-
- # Computes the SHA1 of the receiver
- #
- # Returns a digest of 20 bytes as a String,
- # note that all the characters are not necessarily ASCII.
- # If you want the hex string version of the digest, use
- # sha1_to_s.
- #
- # import base64
- # assert "The quick brown fox jumps over the lazy dog".sha1.encode_base64 == "L9ThxnotKPzthJ7hu3bnORuT6xI="
- fun sha1: String import String.to_cstring, String.length, NativeString.to_s_with_length `{
+redef class NativeString
+ private fun sha1_intern(len: Int): NativeString `{
sha1nfo s;
sha1_init(&s);
- sha1_write(&s, String_to_cstring(self), String_length(self));
+ sha1_write(&s, self, len);
uint8_t* digest = sha1_result(&s);
char* digested = malloc(21);
digested[20] = '\0';
- return NativeString_to_s_with_length(digested, 20);
+ return digested;
`}
+end
+
+redef class String
+
+ # Computes the SHA1 of the receiver
+ #
+ # Returns a digest of 20 bytes as a `Bytes`,
+ # note that all the characters are not necessarily ASCII.
+ # If you want the hex string version of the digest, use
+ # sha1_hexdigest.
+ #
+ # import base64
+ # assert "The quick brown fox jumps over the lazy dog".sha1 == [0x2Fu8, 0xD4u8, 0xE1u8, 0xC6u8, 0x7Au8, 0x2Du8, 0x28u8, 0xFCu8, 0xEDu8, 0x84u8, 0x9Eu8, 0xE1u8, 0xBBu8, 0x76u8, 0xE7u8, 0x39u8, 0x1Bu8, 0x93u8, 0xEBu8, 0x12u8]
+ fun sha1: Bytes do
+ return new Bytes(to_cstring.sha1_intern(bytelen), 20, 20)
+ end
# Computes the SHA1 of the receiver.
#
# Returns a 40 char String containing the Hexadecimal
# Digest in its Char form.
#
- # assert "The quick brown fox jumps over the lazy dog".sha1_to_s == "2FD4E1C67A2D28FCED849EE1BB76E7391B93EB12"
- fun sha1_to_s: String import String.to_cstring, String.length, NativeString.to_s_with_length `{
- sha1nfo s;
-
- sha1_init(&s);
- sha1_write(&s, String_to_cstring(self), String_length(self));
- uint8_t* digest = sha1_result(&s);
-
- char* ret_str = malloc(41);
- char* hexmap = "0123456789ABCDEF";
-
- int i;
- for(i=0;i<20;i++){
- uint8_t q = digest[i];
- ret_str[i*2] = hexmap[q >> 4];
- ret_str[(i*2)+1] = hexmap[q & 0x0F];
- }
- ret_str[40] = '\0';
-
- return NativeString_to_s_with_length(ret_str, 40);
- `}
-
+ # assert "The quick brown fox jumps over the lazy dog".sha1_hexdigest == "2FD4E1C67A2D28FCED849EE1BB76E7391B93EB12"
+ fun sha1_hexdigest: String do return sha1.hexdigest
end
-
resp_map["Connection:"] = "Upgrade"
var key = heads["Sec-WebSocket-Key"]
key += "258EAFA5-E914-47DA-95CA-C5AB0DC85B11"
- key = key.sha1.encode_base64
+ key = key.sha1.encode_base64.to_s
resp_map["Sec-WebSocket-Accept:"] = key
var resp = resp_map.join("\r\n", " ")
resp += "\r\n\r\n"
FlatString = 18
Calls to first_byte on FlatString 153
Calls to last_byte on FlatString 103
-FlatStrings allocated with length 81 (85.417%)
+FlatStrings allocated with length 82 (86.458%)
Length of travel for index distribution:
* null = 20 => occurences 83.333%, cumulative 83.333%
* 1 = 8 => occurences 21.053%, cumulative 73.684%
if name == "Array[nullable Object]" then return new Array[nullable Object].from_deserializer(self)
if name == "Array[Serializable]" then return new Array[Serializable].from_deserializer(self)
if name == "Array[Object]" then return new Array[Object].from_deserializer(self)
+ if name == "Array[Int]" then return new Array[Int].from_deserializer(self)
if name == "Array[Match]" then return new Array[Match].from_deserializer(self)
if name == "Array[nullable Match]" then return new Array[nullable Match].from_deserializer(self)
return super
FlatString = 18
Calls to first_byte on FlatString 153
Calls to last_byte on FlatString 103
-FlatStrings allocated with length 81 (85.417%)
+FlatStrings allocated with length 82 (86.458%)
Length of travel for index distribution:
* 0 = 20 => occurences 83.333%, cumulative 83.333%
* 1 = 8 => occurences 21.053%, cumulative 73.684%