This is an attempt to make text variants substantially lighter by adding cost to certain rare operations.
With this PR:
* `to_cstring` is not cached anymore, this induces more freedom on the FFI-side of things since the copy will be done on site, therefore the `char*` is now mutable on this side.
* `chars` and `bytes` are not attributes anymore, since they are only seldom used, they brought too much of a weight to the string envelope to justify such a spatial overhead
* the `is_dirty` flag was also removed from `FlatBuffer` since it no longer served any purpose
* the `flat_cache_last_pos` attribute is also removed from `Concat` and replaced by a direct access to the length of the underlying cached flat string
The sum of these removals make the envelope of a string lighter, from 104 bytes to 56 bytes.
This proves effective to reduce spatial use and (theoretically) reduces the pressure put on the garbage collector as well as improve caching (64 bytes per line, now a string envelope should fit in completely !).
As an example, passing the JSON parser on a 100 Mio string gives a memory use of:
* Before: 945 Mio
* After: 810 Mio
Pull-Request: #2183
Reviewed-by: Jean Privat <jean@pryen.org>
var was_slash = false
var i = 0
while i < length do
- var c = chars[i]
+ var c = self[i]
if not was_slash then
if c == '\\' then
was_slash = true
var res = new Buffer
var underscore = false
var start = 0
- var c = chars[0]
+ var c = self[0]
if c >= '0' and c <= '9' then
res.add('_')
start = 1
end
for i in [start..length[ do
- c = chars[i]
+ c = self[i]
if (c >= 'a' and c <= 'z') or (c >='A' and c <= 'Z') then
res.add(c)
underscore = false
redef type SELFTYPE: Buffer is fixed
- # Specific implementations MUST set this to `true` in order to invalidate caches
- protected var is_dirty = true
-
# Copy-On-Write flag
#
# If the `Buffer` was to_s'd, the next in-place altering
end
end
- redef fun hash
- do
- if is_dirty then hash_cache = null
- return super
- end
-
# In Buffers, the internal sequence of character is mutable
# Thus, `chars` can be used to modify the buffer.
redef fun chars: Sequence[Char] is abstract
# Index at which `self` begins in `_items`, inclusively
redef var first_byte is noinit
- redef var chars = new FlatStringCharView(self) is lazy
+ redef fun chars do return new FlatStringCharView(self)
- redef var bytes = new FlatStringByteView(self) is lazy
+ redef fun bytes do return new FlatStringByteView(self)
- redef var to_cstring is lazy do
+ redef fun to_cstring do
var blen = _byte_length
var new_items = new NativeString(blen + 1)
_items.copy_to(new_items, blen, _first_byte, 0)
super FlatText
super Buffer
- redef var chars: Sequence[Char] = new FlatBufferCharView(self) is lazy
+ redef fun chars do return new FlatBufferCharView(self)
- redef var bytes = new FlatBufferByteView(self) is lazy
-
- private var char_cache: Int = -1
-
- private var byte_cache: Int = -1
+ redef fun bytes do return new FlatBufferByteView(self)
private var capacity = 0
- # Real items, used as cache for when to_cstring is called
- private var real_items: NativeString is noinit
-
redef fun fast_cstring do return _items.fast_cstring(0)
redef fun substrings do return new FlatSubstringsIter(self)
do
assert index >= 0 and index <= _length
if written then reset
- is_dirty = true
if index == _length then
add item
return
redef fun add(c)
do
if written then reset
- is_dirty = true
var clen = c.u8char_len
var bt = _byte_length
enlarge(bt + clen)
end
redef fun clear do
- is_dirty = true
_byte_length = 0
_length = 0
if written then
redef fun to_cstring
do
- if is_dirty then
- var bln = _byte_length
- var new_native = new NativeString(bln + 1)
- new_native[bln] = 0u8
- if _length > 0 then _items.copy_to(new_native, bln, 0, 0)
- real_items = new_native
- is_dirty = false
- end
- return real_items
+ var bln = _byte_length
+ var new_native = new NativeString(bln + 1)
+ new_native[bln] = 0u8
+ if _length > 0 then _items.copy_to(new_native, bln, 0, 0)
+ return new_native
end
# Create a new empty string.
redef fun append(s)
do
if s.is_empty then return
- is_dirty = true
var sl = s.byte_length
var nln = _byte_length + sl
enlarge(nln)
copy_to(new_self, length, 0, 0)
var str = new FlatString.with_infos(new_self, length, 0)
new_self[length] = 0u8
- str.to_cstring = new_self
return str
end
super Rope
super String
- redef var chars is lazy do return new RopeChars(self)
+ redef fun chars do return new RopeChars(self)
- redef var bytes is lazy do return new RopeBytes(self)
+ redef fun bytes do return new RopeBytes(self)
redef var length is noinit
var flat_cache: FlatString is noinit
# Position of the beginning of `flat_cache` in `self`
- var flat_last_pos_start: Int = -1
+ var flat_last_pos_start: Int is noinit
- var flat_last_pos_end: Int = -1
-
- redef var to_cstring is lazy do
+ redef fun to_cstring do
var len = _byte_length
var ns = new NativeString(len + 1)
ns[len] = 0u8
init do
var l = _left
var r = _right
- length = l.length + r.length
+ _length = l.length + r.length
_byte_length = l.byte_length + r.byte_length
+ _flat_last_pos_start = _length
end
redef fun is_empty do return _byte_length == 0
end
redef fun [](i) do
- assert i >= 0 and i <= _length
+ assert i >= 0 and i < _length
var flps = _flat_last_pos_start
- if flps != -1 and i >= flps and i <= _flat_last_pos_end then
- return _flat_cache.fetch_char_at(i - flps)
+ if i >= flps then
+ var fc = _flat_cache
+ if i < flps + fc._length then return fc.fetch_char_at(i - flps)
end
var lf = get_leaf_at(i)
return lf.fetch_char_at(i - _flat_last_pos_start)
fun get_leaf_at(pos: Int): FlatString do
var flps = _flat_last_pos_start
- if flps != -1 and pos >= flps and pos <= _flat_last_pos_end then
- return _flat_cache
+ if pos >= flps then
+ var fc = _flat_cache
+ if pos < flps + fc._length then return fc
end
var s: String = self
var st = pos
end
end
_flat_last_pos_start = st - pos
- _flat_last_pos_end = st - pos + s.length - 1
_flat_cache = s
return s
end
var end_index = from + count - 1
var flps = _flat_last_pos_start
- if flps != -1 and from >= flps and end_index <= _flat_last_pos_end then
- return _flat_cache.substring_impl(from - flps, count, end_index - flps)
+ if from >= flps then
+ var fc = _flat_cache
+ if end_index < flps + fc._length then
+ return fc.substring_impl(from - flps, count, end_index - flps)
+ end
end
var lft = _left
super Rope
super Buffer
- redef var chars: Sequence[Char] is lazy do return new RopeBufferChars(self)
+ redef fun chars do return new RopeBufferChars(self)
- redef var bytes is lazy do return new RopeBufferBytes(self)
+ redef fun bytes do return new RopeBufferBytes(self)
# The final string being built on the fly
private var str: String = ""
var j = from
while j < n - m + 1 do
var i = m - 1 # Cursor in the pattern
- while i >= 0 and _motif.chars[i] == s.chars[i + j] do i -= 1
+ while i >= 0 and _motif[i] == s[i + j] do i -= 1
if i < 0 then
return j
else
var gs = _gs[i] # Good shift
- var bc = bc(s.chars[i+j]) - m + 1 + i # Bad char
+ var bc = bc(s[i+j]) - m + 1 + i # Bad char
# Both are true, do move to the best
if gs > bc then
j += gs
do
var stop = s.length
while from < stop do
- if s.chars[from] == self then return from
+ if s[from] == self then return from
from += 1
end
return -1
var stop = s.length - length + 1
while from < stop do
var i = length - 1
- while i >= 0 and self.chars[i] == s.chars[i + from] do i -= 1
+ while i >= 0 and self[i] == s[i + from] do i -= 1
# Test if we found
if i < 0 then return from
# Not found so try next one
source, tag, new Comm.world, new Status.ignore)
length = capacity
- is_dirty = true
end
redef fun recv_fill(mpi, dest, tag, comm) do recv(mpi, 0, capacity, dest, tag, comm)
--- /dev/null
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import core
+intrude import core::text::ropes
+
+var lft = "Strings are either R"
+var rgt = "opes or FlatStrings!"
+
+var rp = new Concat(lft, rgt)
+
+print rp.length
+print rp.left.length
+print rp.right.length
+
+print rp[8]
+print rp.substring(12, 10)
--- /dev/null
+40
+20
+20
+a
+either Rop