rename `NativeString` to `CString`
[nit.git] / lib / core / text / ropes.nit
1 # This file is part of NIT (http://www.nitlanguage.org).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Tree-based representation of a String.
16 #
17 # Ropes are a data structure introduced in a 1995 paper from
18 # Hans J. Boehm, Russ Atkinson and Michael Plass.
19 #
20 # See:
21 #
22 # > Ropes: an Alternative to Strings,
23 # > *Software - Practice and Experience*,
24 # > Vol. 25(12), 1315-1330 (December 1995).
25 #
26 # The implementation developed here provides an automatic change
27 # of data structure depending on the length of the leaves.
28 #
29 # The length from which a `Rope` is built from a `flat` string
30 # if defined at top-level (see `maxlen`) and can be redefined by clients
31 # depending on their needs (e.g. if you need to bench the speed of
32 # the creation of concat nodes, lower the size of maxlen).
33 #
34 # A Rope as defined in the original paper is a Tree made of concatenation
35 # nodes and containing `Flat` (that is Array-based) strings as Leaves.
36 #
37 # Example :
38 #
39 # ~~~raw
40 # Concat
41 # / \
42 # Concat Concat
43 # / \ / \
44 # "My" " Name" " is" " Simon."
45 # ~~~
46 #
47 # Note that the above example is not representative of the actual implementation
48 # of `Ropes`, since short leaves are merged to keep the rope at an acceptable
49 # height (hence, this rope here might actually be a `FlatString` !).
50 module ropes
51
52 intrude import flat
53
54 # Maxlen is the maximum length of a Leaf node
55 #
56 # When concatenating two leaves, if `new_length` > `maxlen`,
57 # A `Concat` node is created instead
58 #
59 # Its purpose is to limit the depth of the `Rope` (this
60 # improves performance when accessing/iterating).
61 fun maxlen: Int do return 512
62
63 # String using a tree-based representation with leaves as `FlatStrings`
64 private abstract class Rope
65 super Text
66 end
67
68 # Node that represents a concatenation between two `String`
69 private class Concat
70 super Rope
71 super String
72
73 redef fun chars do return new RopeChars(self)
74
75 redef fun bytes do return new RopeBytes(self)
76
77 redef var length is noinit
78
79 redef var byte_length is noinit
80
81 redef fun substrings do return new RopeSubstrings.from(self, 0)
82
83 redef fun empty do return ""
84
85 # Cache for the latest accessed FlatString in `self`
86 var flat_cache: FlatString is noinit
87
88 # Position of the beginning of `flat_cache` in `self`
89 var flat_last_pos_start: Int is noinit
90
91 redef fun to_cstring do
92 var len = _byte_length
93 var ns = new CString(len + 1)
94 ns[len] = 0u8
95 var off = 0
96 for i in substrings do
97 var ilen = i._byte_length
98 i.as(FlatString)._items.copy_to(ns, ilen, i.as(FlatString)._first_byte, off)
99 off += ilen
100 end
101 return ns
102 end
103
104 # Left child of the node
105 var left: String
106 # Right child of the node
107 var right: String
108
109 init do
110 var l = _left
111 var r = _right
112 _length = l.length + r.length
113 _byte_length = l.byte_length + r.byte_length
114 _flat_last_pos_start = _length
115 end
116
117 redef fun is_empty do return _byte_length == 0
118
119 redef fun output do
120 _left.output
121 _right.output
122 end
123
124 redef fun iterator do return new RopeCharIterator.from(self, 0)
125
126 redef fun *(i) do
127 var x: String = self
128 for j in [1 .. i[ do x += self
129 return x
130 end
131
132 redef fun [](i) do
133 assert i >= 0 and i < _length
134 var flps = _flat_last_pos_start
135 if i >= flps then
136 var fc = _flat_cache
137 if i < flps + fc._length then return fc.fetch_char_at(i - flps)
138 end
139 var lf = get_leaf_at(i)
140 return lf.fetch_char_at(i - _flat_last_pos_start)
141 end
142
143 fun get_leaf_at(pos: Int): FlatString do
144 var flps = _flat_last_pos_start
145 if pos >= flps then
146 var fc = _flat_cache
147 if pos < flps + fc._length then return fc
148 end
149 var s: String = self
150 var st = pos
151 loop
152 if s isa FlatString then break
153 s = s.as(Concat)
154 var lft = s._left
155 var llen = lft.length
156 if pos >= llen then
157 s = s._right
158 pos -= llen
159 else
160 s = lft
161 end
162 end
163 _flat_last_pos_start = st - pos
164 _flat_cache = s
165 return s
166 end
167
168 redef fun substring(from, count) do
169 if from < 0 then
170 count += from
171 if count < 0 then return ""
172 from = 0
173 end
174
175 var ln = _length
176 if (count + from) > ln then count = ln - from
177 if count <= 0 then return ""
178 var end_index = from + count - 1
179
180 var flps = _flat_last_pos_start
181 if from >= flps then
182 var fc = _flat_cache
183 if end_index < flps + fc._length then
184 return fc.substring_impl(from - flps, count, end_index - flps)
185 end
186 end
187
188 var lft = _left
189 var llen = lft.length
190 if from < llen then
191 if from + count < llen then return lft.substring(from, count)
192 var lsublen = llen - from
193 return lft.substring_from(from) + _right.substring(0, count - lsublen)
194 else
195 return _right.substring(from - llen, count)
196 end
197 end
198
199 redef fun reversed do return new Concat(_right.reversed, _left.reversed)
200
201 redef fun insert_at(s, pos) do
202 var lft = _left
203 if pos > lft.length then
204 return lft + _right.insert_at(s, pos - lft.length)
205 end
206 return lft.insert_at(s, pos) + _right
207 end
208
209 redef fun to_upper do return new Concat(_left.to_upper, _right.to_upper)
210
211 redef fun to_lower do return new Concat(_left.to_lower, _right.to_lower)
212
213 redef fun +(o) do
214 var s = o.to_s
215 var slen = s.byte_length
216 if s isa Concat then
217 return new Concat(self, s)
218 else
219 var r = _right
220 var rlen = r.byte_length
221 if rlen + slen > maxlen then return new Concat(self, s)
222 return new Concat(_left, r + s)
223 end
224 end
225
226 redef fun copy_to_native(dest, n, src_offset, dest_offset) do
227 var l = _left
228 if src_offset < l.byte_length then
229 var lcopy = l.byte_length - src_offset
230 lcopy = if lcopy > n then n else lcopy
231 l.copy_to_native(dest, lcopy, src_offset, dest_offset)
232 dest_offset += lcopy
233 n -= lcopy
234 src_offset = 0
235 end
236 _right.copy_to_native(dest, n, src_offset, dest_offset)
237 end
238
239 # Returns a balanced version of `self`
240 fun balance: String do
241 var children = new Array[String]
242 var rnod: String
243 var iter: nullable RopeCharIteratorPiece = new RopeCharIteratorPiece(self, false, false, null)
244 loop
245 if iter == null then break
246 rnod = iter.node
247 if not rnod isa Concat then
248 children.push rnod
249 iter = iter.prev
250 continue
251 end
252 if not iter.ldone then
253 iter.ldone = true
254 iter = new RopeCharIteratorPiece(rnod._left, false, false, iter)
255 else if not iter.rdone then
256 iter.rdone = true
257 iter = new RopeCharIteratorPiece(rnod._right, false, false, iter)
258 else
259 iter = iter.prev
260 end
261
262 end
263 return recurse_balance(children, children.length)
264 end
265
266 fun recurse_balance(nodes: Array[String], len: Int): String do
267 var finpos = 0
268 var stpos = 0
269 while stpos < len do
270 if len - stpos > 1 then
271 nodes[finpos] = new Concat(nodes[stpos], nodes[stpos + 1])
272 stpos += 2
273 else
274 nodes[finpos] = nodes[stpos]
275 stpos += 1
276 end
277 finpos += 1
278 end
279 if finpos == 1 then return nodes[0]
280 return recurse_balance(nodes, finpos)
281 end
282 end
283
284 redef class FlatString
285
286 redef fun insert_at(s, pos) do
287 var l = substring(0, pos)
288 var r = substring_from(pos)
289 return l + s + r
290 end
291
292 redef fun +(o) do
293 var s = o.to_s
294 var slen = s.byte_length
295 var mlen = _byte_length
296 if slen == 0 then return self
297 if mlen == 0 then return s
298 var nlen = slen + mlen
299 if s isa FlatString then
300 if nlen > maxlen then return new Concat(self, s)
301 var mits = _items
302 var sifrom = s._first_byte
303 var mifrom = _first_byte
304 var sits = s._items
305 var ns = new CString(nlen + 1)
306 mits.copy_to(ns, mlen, mifrom, 0)
307 sits.copy_to(ns, slen, sifrom, mlen)
308 return new FlatString.full(ns, nlen, 0, length + s.length)
309 else if s isa Concat then
310 var sl = s._left
311 var sllen = sl.byte_length
312 if sllen + mlen > maxlen then return new Concat(self, s)
313 return new Concat(self + sl, s._right)
314 else
315 abort
316 end
317 end
318 end
319
320 # A simple linked list for use with iterators
321 private class RopeCharIteratorPiece
322 # The encapsulated node of the `Rope`
323 var node: String
324 # Was its _left child (if any) visited ?
325 var ldone: Bool
326 # Was its _right child (if any) visited ?
327 var rdone: Bool
328 # The previous node in the list.
329 var prev: nullable RopeCharIteratorPiece
330 end
331
332 # A reverse iterator capable of working with `Rope` objects
333 private class RopeByteReverseIterator
334 super IndexedIterator[Byte]
335
336 # Current CString
337 var ns: CString is noautoinit
338 # Current position in CString
339 var pns: Int is noautoinit
340 # Position in the Rope (0-indexed)
341 var pos: Int is noautoinit
342 # Iterator on the substrings, does the Postfix part of
343 # the Rope traversal.
344 var subs: IndexedIterator[FlatString] is noautoinit
345
346 init from(root: Concat, pos: Int) do
347 self.pos = pos
348 subs = new ReverseRopeSubstrings.from(root, pos)
349 var s = subs.item
350 ns = s._items
351 pns = pos - subs.index
352 end
353
354 redef fun index do return pos
355
356 redef fun is_ok do return pos >= 0
357
358 redef fun item do return ns[pns]
359
360 redef fun next do
361 pns -= 1
362 pos -= 1
363 if pns >= 0 then return
364 if not subs.is_ok then return
365 subs.next
366 if not subs.is_ok then return
367 var s = subs.item
368 ns = s._items
369 pns = s.last_byte
370 end
371 end
372
373 # Forward iterator on the bytes of a `Rope`
374 private class RopeByteIterator
375 super IndexedIterator[Byte]
376
377 # Position in current `String`
378 var pns: Int is noautoinit
379 # Current `String` being iterated on
380 var ns: CString is noautoinit
381 # Substrings of the Rope
382 var subs: IndexedIterator[FlatString] is noautoinit
383 # Maximum position to iterate on (e.g. Rope.byte_length)
384 var max: Int is noautoinit
385 # Position (char) in the Rope (0-indexed)
386 var pos: Int is noautoinit
387
388 init from(root: Concat, pos: Int) do
389 subs = new RopeSubstrings.from(root, pos)
390 pns = pos - subs.index
391 self.pos = pos
392 ns = subs.item._items
393 max = root.byte_length - 1
394 end
395
396 redef fun item do return ns[pns]
397
398 redef fun is_ok do return pos <= max
399
400 redef fun index do return pos
401
402 redef fun next do
403 pns += 1
404 pos += 1
405 if pns < subs.item._byte_length then return
406 if not subs.is_ok then return
407 subs.next
408 if not subs.is_ok then return
409 ns = subs.item._items
410 pns = 0
411 end
412 end
413
414
415 # A reverse iterator capable of working with `Rope` objects
416 private class RopeCharReverseIterator
417 super IndexedIterator[Char]
418
419 # Current CString
420 var ns: String is noautoinit
421 # Current position in CString
422 var pns: Int is noautoinit
423 # Position in the Rope (0-indexed)
424 var pos: Int is noautoinit
425 # Iterator on the substrings, does the Postfix part of
426 # the Rope traversal.
427 var subs: IndexedIterator[String] is noautoinit
428
429 init from(root: Concat, pos: Int) do
430 self.pos = pos
431 subs = new ReverseRopeSubstrings.from(root, pos)
432 ns = subs.item
433 pns = pos - subs.index
434 end
435
436 redef fun index do return pos
437
438 redef fun is_ok do return pos >= 0
439
440 redef fun item do return ns[pns]
441
442 redef fun next do
443 pns -= 1
444 pos -= 1
445 if pns >= 0 then return
446 if not subs.is_ok then return
447 subs.next
448 if not subs.is_ok then return
449 ns = subs.item
450 pns = ns.length - 1
451 end
452 end
453
454 # Forward iterator on the chars of a `Rope`
455 private class RopeCharIterator
456 super IndexedIterator[Char]
457
458 # Position in current `String`
459 var pns: Int is noautoinit
460 # Current `String` being iterated on
461 var str: String is noautoinit
462 # Substrings of the Rope
463 var subs: IndexedIterator[String] is noautoinit
464 # Maximum position to iterate on (e.g. Rope.length)
465 var max: Int is noautoinit
466 # Position (char) in the Rope (0-indexed)
467 var pos: Int is noautoinit
468
469 init from(root: Concat, pos: Int) do
470 subs = new RopeSubstrings.from(root, pos)
471 pns = pos - subs.index
472 self.pos = pos
473 str = subs.item
474 max = root.length - 1
475 end
476
477 redef fun item do return str[pns]
478
479 redef fun is_ok do return pos <= max
480
481 redef fun index do return pos
482
483 redef fun next do
484 pns += 1
485 pos += 1
486 if pns < subs.item.length then return
487 if not subs.is_ok then return
488 subs.next
489 if not subs.is_ok then return
490 str = subs.item
491 pns = 0
492 end
493 end
494
495 # Substrings of a Rope (i.e. Reverse postfix iterator on leaves)
496 private class ReverseRopeSubstrings
497 super IndexedIterator[FlatString]
498
499 # Visit Stack
500 var iter: RopeCharIteratorPiece is noinit
501 # Position in `Rope`
502 var pos: Int is noinit
503
504 # Current leaf
505 var str: FlatString is noinit
506
507 init from(root: Concat, pos: Int) do
508 var r = new RopeCharIteratorPiece(root, false, true, null)
509 var rnod: String = root
510 var off = pos
511 loop
512 if rnod isa Concat then
513 if off >= rnod._left.length then
514 off -= rnod._left.length
515 rnod = rnod._right
516 r = new RopeCharIteratorPiece(rnod, false, true, r)
517 else
518 r.ldone = true
519 rnod = rnod._left
520 r = new RopeCharIteratorPiece(rnod, false, true, r)
521 end
522 else
523 str = rnod.as(FlatString)
524 r.ldone = true
525 iter = r
526 self.pos = pos - off
527 break
528 end
529 end
530 end
531
532 redef fun item do return str
533
534 redef fun index do return pos
535
536 redef fun is_ok do return pos >= 0
537
538 redef fun next do
539 if pos < 0 then return
540 var curr = iter.prev
541 var currit = curr.as(not null).node
542 while curr != null do
543 currit = curr.node
544 if not currit isa Concat then
545 str = currit.as(FlatString)
546 pos -= str.length
547 iter = curr
548 return
549 end
550 if not curr.rdone then
551 curr.rdone = true
552 curr = new RopeCharIteratorPiece(currit._right, false, false, curr)
553 continue
554 end
555 if not curr.ldone then
556 curr.ldone = true
557 curr = new RopeCharIteratorPiece(currit._left, false, false, curr)
558 continue
559 end
560 curr = curr.prev
561 end
562 pos = -1
563 end
564 end
565
566 # Substrings of a Rope (i.e. Postfix iterator on leaves)
567 private class RopeSubstrings
568 super IndexedIterator[FlatString]
569
570 # Visit Stack
571 var iter: RopeCharIteratorPiece is noinit
572 # Position in `Rope`
573 var pos: Int is noinit
574 # Maximum position in `Rope` (i.e. length - 1)
575 var max: Int is noinit
576
577 # Current leaf
578 var str: FlatString is noinit
579
580 init from(root: Concat, pos: Int) do
581 var r = new RopeCharIteratorPiece(root, true, false, null)
582 max = root.length - 1
583 var rnod: String = root
584 var off = pos
585 loop
586 if rnod isa Concat then
587 if off >= rnod._left.length then
588 r.rdone = true
589 off -= rnod._left.length
590 rnod = rnod._right
591 r = new RopeCharIteratorPiece(rnod, true, false, r)
592 else
593 rnod = rnod._left
594 r = new RopeCharIteratorPiece(rnod, true, false, r)
595 end
596 else
597 str = rnod.as(FlatString)
598 r.rdone = true
599 iter = r
600 self.pos = pos - off
601 break
602 end
603 end
604 end
605
606 redef fun item do return str
607
608 redef fun is_ok do return pos <= max
609
610 redef fun index do return pos
611
612 redef fun next do
613 pos += str.length
614 if pos > max then return
615 var it = iter.prev.as(not null)
616 var rnod = it.node
617 loop
618 if not rnod isa Concat then
619 it.ldone = true
620 it.rdone = true
621 str = rnod.as(FlatString)
622 iter = it
623 break
624 end
625 if not it.ldone then
626 rnod = rnod._left
627 it.ldone = true
628 it = new RopeCharIteratorPiece(rnod, false, false, it)
629 else if not it.rdone then
630 it.rdone = true
631 rnod = rnod._right
632 it = new RopeCharIteratorPiece(rnod, false, false, it)
633 else
634 it = it.prev.as(not null)
635 rnod = it.node
636 continue
637 end
638 end
639 end
640 end
641
642 # Implementation of a `StringCharView` for `Concat` objects
643 private class RopeChars
644 super StringCharView
645
646 redef type SELFTYPE: Concat
647
648 redef fun [](i) do
649 return target[i]
650 end
651
652 redef fun iterator_from(i) do return new RopeCharIterator.from(target, i)
653
654 redef fun reverse_iterator_from(i) do return new RopeCharReverseIterator.from(target, i)
655
656 end
657
658 # Implementation of a `StringCharView` for `Concat` objects
659 private class RopeBytes
660 super StringByteView
661
662 redef type SELFTYPE: Concat
663
664 var cache: FlatString is noinit
665
666 var cache_start: Int = -1
667
668 var cache_end: Int = -1
669
670 redef fun [](i) do
671 assert i >= 0 and i < target._byte_length
672 var flps = _cache_start
673 if i >= flps and i <= _cache_end then
674 return _cache.bytes[i - flps]
675 end
676 var lf = get_leaf_at(i)
677 return lf.bytes[i - _cache_start]
678 end
679
680 fun get_leaf_at(pos: Int): FlatString do
681 var flps = _cache_start
682 if pos >= flps and pos <= _cache_end then
683 return _cache
684 end
685 var s: String = target
686 var st = pos
687 loop
688 if s isa FlatString then break
689 s = s.as(Concat)
690 var lft = s._left
691 var llen = lft.byte_length
692 if pos >= llen then
693 s = s._right
694 pos -= llen
695 else
696 s = lft
697 end
698 end
699 _cache_start = st - pos
700 _cache_end = st - pos + s.byte_length - 1
701 _cache = s
702 return s
703 end
704
705 redef fun iterator_from(i) do return new RopeByteIterator.from(target, i)
706
707 redef fun reverse_iterator_from(i) do return new RopeByteReverseIterator.from(target, i)
708
709 end