This simple PR re-introduces the concept of ASCII strings for faster no-overhead accesses to random characters in a string.
The class is private as it is to be used only internally, no client should care for the specific type of a String and all performance improvements will be automatic, should the optimization be possible.
Since we are dealing with UTF-8, the only case where O(1) access can efficiently and without much overhead (one `if` statement at creation) be determined, is when a string only contains ASCII characters.
Note that since the Buffer can change access semantics depending on what is appended during its lifetime, no ASCII-only FlatBuffer has been added.
This is implemented here by introducing two subclasses of FlatString, one for ASCII only strings, and one for regular Unicode strings (the old FlatString).
In terms of performance, in the compiler, there is a ~0.5% improvement, but in specific cases like JSON parsing for the `big_gov_data.json` (where not a single unicode character is present) bench, the difference amounts to ~35%.
Pull-Request: #2040
Reviewed-by: Jean Privat <jean@pryen.org>
{
- "message": "Wrong type for `soin` (expected Int got FlatString)"
+ "message": "Wrong type for `soin` (expected Int got ASCIIFlatString)"
}
end
# Immutable strings of characters.
-class FlatString
+abstract class FlatString
super FlatText
super String
redef fun fast_cstring do return _items.fast_cstring(_first_byte)
- redef fun substring_from(from) do
- if from >= self._length then return empty
- if from <= 0 then return self
- var c = char_to_byte_index(from)
- var st = c - _first_byte
- var fln = bytelen - st
- return new FlatString.full(items, fln, c, _length - from)
- end
-
redef fun substring(from, count)
do
if count <= 0 then return ""
#
# `_items` will be used as is, without copy, to retrieve the characters of the string.
# Aliasing issues is the responsibility of the caller.
- private init with_infos(items: NativeString, bytelen, from: Int)
+ private new with_infos(items: NativeString, bytelen, from: Int)
do
- self._items = items
- self._bytelen = bytelen
- _first_byte = from
- _bytepos = from
- _length = _items.utf8_length(_first_byte, bytelen)
+ var len = items.utf8_length(from, bytelen)
+ if bytelen == len then return new ASCIIFlatString.full_data(items, bytelen, from, len)
+ return new UnicodeFlatString.full_data(items, bytelen, from, len)
end
# Low-level creation of a new string with all the data.
#
# `_items` will be used as is, without copy, to retrieve the characters of the string.
# Aliasing issues is the responsibility of the caller.
- private init full(items: NativeString, bytelen, from, length: Int)
+ private new full(items: NativeString, bytelen, from, length: Int)
do
- self._items = items
- self._length = length
- self._bytelen = bytelen
- _first_byte = from
- _bytepos = from
+ if bytelen == length then return new ASCIIFlatString.full_data(items, bytelen, from, length)
+ return new UnicodeFlatString.full_data(items, bytelen, from, length)
end
redef fun ==(other)
return new FlatString.full(ns, new_bytelen, 0, newlen)
end
-
redef fun hash
do
if hash_cache == null then
redef fun substrings do return new FlatSubstringsIter(self)
end
+# Regular Nit UTF-8 strings
+private class UnicodeFlatString
+ super FlatString
+
+ init full_data(items: NativeString, bytelen, from, length: Int) do
+ self._items = items
+ self._length = length
+ self._bytelen = bytelen
+ _first_byte = from
+ _bytepos = from
+ end
+
+ redef fun substring_from(from) do
+ if from >= self._length then return empty
+ if from <= 0 then return self
+ var c = char_to_byte_index(from)
+ var st = c - _first_byte
+ var fln = bytelen - st
+ return new FlatString.full(items, fln, c, _length - from)
+ end
+end
+
+# Special cases of String where all the characters are ASCII-based
+#
+# Optimizes access operations to O(1) complexity.
+private class ASCIIFlatString
+ super FlatString
+
+ init full_data(items: NativeString, bytelen, from, length: Int) do
+ self._items = items
+ self._length = length
+ self._bytelen = bytelen
+ _first_byte = from
+ _bytepos = from
+ end
+
+ redef fun [](idx) do
+ assert idx < _bytelen and idx >= 0
+ return _items[idx + _first_byte].ascii
+ end
+
+ redef fun substring(from, count) do
+ if count <= 0 then return ""
+
+ if from < 0 then
+ count += from
+ if count < 0 then return ""
+ from = 0
+ end
+ var ln = _length
+ if (count + from) > ln then count = ln - from
+ return new ASCIIFlatString.full_data(_items, count, from + _first_byte, count)
+ end
+
+ redef fun reversed do
+ var b = new FlatBuffer.with_capacity(_bytelen + 1)
+ var i = _length - 1
+ while i >= 0 do
+ b.add self[i]
+ i -= 1
+ end
+ var s = b.to_s.as(FlatString)
+ return s
+ end
+
+ redef fun char_to_byte_index(index) do return index + _first_byte
+
+ redef fun substring_impl(from, count, end_index) do
+ return new ASCIIFlatString.full_data(_items, count, from + _first_byte, count)
+ end
+
+ redef fun fetch_char_at(i) do return _items[i + _first_byte].ascii
+end
+
private class FlatStringCharReverseIterator
super IndexedIterator[Char]
from = 0
end
- var ln = length
+ var ln = _length
if (count + from) > ln then count = ln - from
if count <= 0 then return ""
var end_index = from + count - 1
var array_type = types.first
var typed_array
- if array_type == "FlatString" then
+ if array_type == "ASCIIFlatString" or array_type == "UnicodeFlatString" then
if has_nullable then
typed_array = new Array[nullable FlatString]
else typed_array = new Array[FlatString]
redef class Sys
- # Counts the number of allocations of FlatString
- var flatstr_allocations = 0
+ # Counts the number of allocations of UnicodeFlatString
+ var uniflatstr_allocations = 0
+
+ # Counts the number of allocations of ASCIIFlatString
+ var asciiflatstr_allocations = 0
# Counts the number of allocations of FlatBuffer
var flatbuf_allocations = 0
Allocations, by type:
"""
- print "\t-FlatString = {flatstr_allocations}"
+ print "\t-UnicodeFlatString = {uniflatstr_allocations}"
+ print "\t-ASCIIFlatString = {asciiflatstr_allocations}"
print "\t-FlatBuffer = {flatbuf_allocations}"
print "\t-Concat = {concat_allocations}"
print "\t-RopeBuffer = {ropebuf_allocations}"
print "Calls to length, by type:"
for k, v in length_calls do
printn "\t{k} = {v}"
- if k == "FlatString" then printn " (cache misses {length_cache_miss[k]}, {div(length_cache_miss[k] * 100, v)}%)"
+ if k == "UnicodeFlatString" then printn " (cache misses {length_cache_miss[k]}, {div(length_cache_miss[k] * 100, v)}%)"
printn "\n"
end
print "Indexed accesses, by type:"
print "Calls to first_byte on FlatString {first_byte_call}"
print "Calls to last_byte on FlatString {last_byte_call}"
- print "FlatStrings allocated with length {str_full_created} ({str_full_created.to_f/flatstr_allocations.to_f * 100.0 }%)"
-
print "Length of travel for index distribution:"
index_len.print_content
return super
end
- init do
- sys.flatstr_allocations += 1
- end
-
- redef init with_infos(items, bytelen, from)
- do
- self.items = items
- self.bytelen = bytelen
- sys.str_bytelen.inc bytelen
- first_byte = from
- length = items.utf8_length(from, bytelen)
- end
-
- redef init full(items, bytelen, from, length)
- do
- self.items = items
- self.length = length
- self.bytelen = bytelen
- sys.str_bytelen.inc bytelen
- sys.str_full_created += 1
- first_byte = from
- end
-
private var length_cache: nullable Int = null
redef fun length do
return super
end
end
+
+redef class ASCIIFlatString
+ redef init full_data(items, bytelen, from, length)
+ do
+ super
+ sys.asciiflatstr_allocations += 1
+ sys.str_full_created += 1
+ end
+end
+
+redef class UnicodeFlatString
+ redef init full_data(items, bytelen, from, length)
+ do
+ super
+ sys.uniflatstr_allocations += 1
+ sys.str_full_created += 1
+ end
+end
-FlatString
+ASCIIFlatString
Int
Test
Test
-FlatString
+ASCIIFlatString
Int
Test
Test
-FlatString
-FlatString
+ASCIIFlatString
+ASCIIFlatString
Class
Class
s isa Bytes
StringAB
537472696E674142
-s2 isa FlatString
+s2 isa UnicodeFlatString
String𐏓
s3 isa Bytes
StringA�
true
true
false
-s5 isa FlatString
+s5 isa UnicodeFlatString
String�
-s6 isa FlatString
+s6 isa ASCIIFlatString
\nStr\x00
-s7 isa FlatString
+s7 isa ASCIIFlatString
\nString66515\x41
-s8 isa FlatString
+s8 isa ASCIIFlatString
String66515A
s9 isa Regex
-<A i: <Int> s: <FlatString>>
+<A i: <Int> s: <ASCIIFlatString>>
i=5 s=Hello
i:5; s:Hello
true
true
-<B i: <Int> s: <FlatString> a: <A i: <Int> s: <FlatString>>>
+<B i: <Int> s: <ASCIIFlatString> a: <A i: <Int> s: <ASCIIFlatString>>>
i=100 s=World a=i:5; s:Hello
i:100; s:World; a:i:5; s:Hello
-<A i: <Int> s: <FlatString>>
+<A i: <Int> s: <ASCIIFlatString>>
i=5 s=Hello
-<A i: <Int> s: <FlatString>>
+<A i: <Int> s: <ASCIIFlatString>>
true
true
true
-<B i: <Int> s: <FlatString> a: <A i: <Int> s: <FlatString>>>
-i=100 s=World a=<A i: <Int> s: <FlatString>>
-<B i: <Int> s: <FlatString> a: <A i: <Int> s: <FlatString>>>
+<B i: <Int> s: <ASCIIFlatString> a: <A i: <Int> s: <ASCIIFlatString>>>
+i=100 s=World a=<A i: <Int> s: <ASCIIFlatString>>
+<B i: <Int> s: <ASCIIFlatString> a: <A i: <Int> s: <ASCIIFlatString>>>
true
true
-<A i: <Int> s: <FlatString>>
+<A i: <Int> s: <ASCIIFlatString>>
i=5 s=Hello
i:5; s:Hello
false
true
-<B i: <Int> s: <FlatString> a: <A i: <Int> s: <FlatString>>>
+<B i: <Int> s: <ASCIIFlatString> a: <A i: <Int> s: <ASCIIFlatString>>>
i=100 s=World a=i:5; s:Hello
i:100; s:World; a:i:5; s:Hello
-<A i: <Int> s: <FlatString>>
+<A i: <Int> s: <ASCIIFlatString>>
i=5 s=Hello
i:5; s:Hello
true
true
-<B i: <Int> s: <FlatString> a: <A i: <Int> s: <FlatString>>>
+<B i: <Int> s: <ASCIIFlatString> a: <A i: <Int> s: <ASCIIFlatString>>>
i=100 s=World
i:100; s:World
-<A i: <Int> s: <FlatString>>
+<A i: <Int> s: <ASCIIFlatString>>
i=5 s=Hello
i:5; s:Hello
true
true
-<B a: <A i: <Int> s: <FlatString>>>
+<B a: <A i: <Int> s: <ASCIIFlatString>>>
string=World a=i:5; s:Hello
string:World; a:i:5; s:Hello
# Nit: <JsonObject i:123, s:hello, f:123.456, a:[one,two], o:<null>>
# JSON: {"__class": "MyClass", "i": 123, "s": "hello", "f": 123.456, "a": ["one", "two"], "o": "Not the right type"}
-# Errors: 'Deserialization Error: Wrong type on `MyClass::o` expected `nullable MyClass`, got `FlatString`'
+# Errors: 'Deserialization Error: Wrong type on `MyClass::o` expected `nullable MyClass`, got `ASCIIFlatString`'
# Nit: <MyClass i:123 s:hello f:123.456 a:[one, two] o:<null>>
# JSON: not valid json
-FlatString
-FlatString
-Class[FlatString]
-Class[Class[FlatString]]
+ASCIIFlatString
+ASCIIFlatString
+Class[ASCIIFlatString]
+Class[Class[ASCIIFlatString]]
XObject
XObject
Allocations, by type:
- -FlatString = 29
+ -UnicodeFlatString = 0
+ -ASCIIFlatString = 30
-FlatBuffer = 2
-Concat = 0
-RopeBuffer = 0
Calls to length, by type:
- FlatString = 18 (cache misses 5, 27.77%)
+ FlatString = 18
Indexed accesses, by type:
- FlatString = 8
Calls to bytelen for each type:
- FlatString = 61
+ FlatString = 48
Calls to position for each type:
- FlatString = 17
Calls to bytepos for each type:
- FlatString = 9
-Calls to first_byte on FlatString 191
-Calls to last_byte on FlatString 19
-FlatStrings allocated with length 78 (86.813%)
+Calls to first_byte on FlatString 50
+Calls to last_byte on FlatString 10
Length of travel for index distribution:
-* 0 = 11 => occurences 73.333%, cumulative 73.333%
-* 1 = 8 => occurences 27.586%, cumulative 65.517%
Byte length of the FlatStrings created:
-* 0 = 6 => occurences 4.478%, cumulative 4.478%
-* 1 = 24 => occurences 16.216%, cumulative 20.27%
-* 2 = 30 => occurences 18.519%, cumulative 37.037%
-* 3 = 29 => occurences 16.384%, cumulative 50.282%
-* 4 = 3 => occurences 1.563%, cumulative 47.917%
-* 5 = 20 => occurences 9.662%, cumulative 54.106%
-* 6 = 26 => occurences 11.712%, cumulative 62.162%
-* 9 = 1 => occurences 0.422%, cumulative 58.65%
-* 10 = 9 => occurences 3.571%, cumulative 58.73%
-* 11 = 2 => occurences 0.749%, cumulative 56.18%
-* 12 = 1 => occurences 0.356%, cumulative 53.737%
-* 13 = 1 => occurences 0.34%, cumulative 51.701%
-* 14 = 1 => occurences 0.326%, cumulative 49.837%
-* 15 = 7 => occurences 2.188%, cumulative 50.0%
-* 16 = 5 => occurences 1.493%, cumulative 49.254%
-* 17 = 1 => occurences 0.286%, cumulative 47.429%
-* 25 = 2 => occurences 0.551%, cumulative 46.281%
-* 26 = 1 => occurences 0.265%, cumulative 44.828%
-* 31 = 2 => occurences 0.513%, cumulative 43.846%
-* 32 = 1 => occurences 0.248%, cumulative 42.574%
-* 33 = 1 => occurences 0.24%, cumulative 41.487%
-* 34 = 2 => occurences 0.465%, cumulative 40.698%
-* 35 = 1 => occurences 0.225%, cumulative 39.64%
-* 37 = 1 => occurences 0.219%, cumulative 38.731%
-* 39 = 1 => occurences 0.213%, cumulative 37.872%
-* 40 = 1 => occurences 0.207%, cumulative 37.06%
-* 43 = 1 => occurences 0.202%, cumulative 36.29%
-* 46 = 1 => occurences 0.196%, cumulative 35.56%
-* 51 = 14 => occurences 2.682%, cumulative 37.356%
-* 52 = 5 => occurences 0.931%, cumulative 37.244%