d41c0213cab68e9b47752f2432e2fa96986ce62a
[nit.git] / lib / standard / ropes.nit
1 # This file is part of NIT (http://www.nitlanguage.org).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Tree-based representation of a String.
16 #
17 # Ropes are a data structure introduced in a 1995 paper from
18 # Hans J. Boehm, Russ Atkinson and Michael Plass.
19 # See : `Ropes : an Alternative to Strings`, `Software - Practice and Experience,
20 # Vol. 25(12), 1315-1330 (December 1995)`.
21 #
22 # The implementation developed here provides an automatic change
23 # of data structure depending on the length of the leaves.
24 #
25 # The length from which a `Rope` is built from a `flat` string
26 # if defined at top-level (see `maxlen`) and can be redefined by clients
27 # depending on their needs (e.g. if you need to bench the speed of
28 # the creation of concat nodes, lower the size of maxlen).
29 #
30 # A Rope as defined in the original paper is a Tree made of concatenation
31 # nodes and containing `Flat` (that is Array-based) strings as Leaves.
32 #
33 # Example :
34 #
35 # ` Concat `
36 # ` / \ `
37 # ` Concat Concat `
38 # ` / \ / \ `
39 # `"My" " Name" " is" " Simon." `
40 #
41 # Note that the above example is not representative of the actual implementation
42 # of `Ropes`, since short leaves are merged to keep the rope at an acceptable
43 # height (hence, this rope here might actually be a `FlatString` !).
44 module ropes
45
46 intrude import string
47
48 # Maxlen is the maximum length of a Leaf node
49 #
50 # When concatenating two leaves, if `new_length` > `maxlen`,
51 # A `Concat` node is created instead
52 #
53 # Its purpose is to limit the depth of the `Rope` (this
54 # improves performance when accessing/iterating).
55 fun maxlen: Int do return 64
56
57 # String using a tree-based representation with leaves as `FlatStrings`
58 private abstract class Rope
59 super Text
60 end
61
62 private abstract class RopeString
63 super Rope
64 super String
65
66 redef fun chars is cached do return new RopeChars(self)
67 end
68
69 # Node that represents a concatenation between two `String`
70 private class Concat
71 super RopeString
72
73 redef var length: Int
74
75 redef fun substrings do return new RopeSubstrings(self)
76
77 redef fun empty do return ""
78
79 redef fun to_cstring is cached do
80 var len = length
81 var ns = new NativeString(len + 1)
82 ns[len] = '\0'
83 var off = 0
84 for i in substrings do
85 var ilen = i.length
86 i.as(FlatString).items.copy_to(ns, ilen, i.as(FlatString).index_from, off)
87 off += ilen
88 end
89 return ns
90 end
91
92 # Left child of the node
93 var left: String
94 # Right child of the node
95 var right: String
96
97 init(l: String, r: String) is old_style_init do
98 left = l
99 right = r
100 length = l.length + r.length
101 end
102
103 redef fun output do
104 left.output
105 right.output
106 end
107
108 redef fun iterator do return new RopeIter(self)
109
110 redef fun *(i) do
111 var x: String = self
112 for j in [1 .. i[ do x += self
113 return x
114 end
115
116 redef fun [](i) do
117 var llen = left.length
118 if i >= llen then return right[i - llen]
119 return left[i]
120 end
121
122 redef fun substring(from, len) do
123 var llen = left.length
124 if from < llen then
125 if from + len < llen then return left.substring(from,len)
126 var lsublen = llen - from
127 return left.substring_from(from) + right.substring(0, len - lsublen)
128 else
129 return right.substring(from - llen, len)
130 end
131 end
132
133 redef fun reversed do return new Concat(right.reversed, left.reversed)
134
135 redef fun insert_at(s, pos) do
136 if pos > left.length then
137 return left + right.insert_at(s, pos - left.length)
138 end
139 return left.insert_at(s, pos) + right
140 end
141
142 redef fun to_upper do return new Concat(left.to_upper, right.to_upper)
143
144 redef fun to_lower do return new Concat(left.to_lower, right.to_lower)
145
146 redef fun +(o) do
147 var s = o.to_s
148 var slen = s.length
149 if s isa Concat then
150 return new Concat(self, s)
151 else
152 var r = right
153 var rlen = r.length
154 if rlen + slen > maxlen then return new Concat(left, new Concat(r, s))
155 return new Concat(left, r + s)
156 end
157 end
158 end
159
160 # Mutable `Rope`, optimized for concatenation operations
161 #
162 # A `RopeBuffer` is an efficient way of building a `String` when
163 # concatenating small strings.
164 #
165 # It does concatenations in an optimized way by having a
166 # mutable part and an immutable part built by efficiently
167 # concatenating strings in chain.
168 #
169 # Every concatenation operation is done by copying a string to
170 # the mutable part and flushing it when full.
171 #
172 # However, when a long string is appended to the `Buffer`,
173 # the concatenation is done at it would be in a `Rope`.
174 class RopeBuffer
175 super Rope
176 super Buffer
177
178 redef fun chars: Sequence[Char] is cached do return new RopeBufferChars(self)
179
180 # The final string being built on the fly
181 private var str: String is noinit
182
183 # Current concatenation buffer
184 private var ns: NativeString is noinit
185
186 # Next available (e.g. unset) character in the `Buffer`
187 private var rpos = 0
188
189 # Keeps track of the buffer's currently dumped part
190 #
191 # This might happen if for instance, a String was being
192 # built by concatenating small parts of string and suddenly
193 # a long string (length > maxlen) is appended.
194 private var dumped: Int is noinit
195
196 # Length of the complete rope
197 redef var length = 0
198
199 # Length of the mutable part
200 #
201 # Is also used as base to compute the size of the next
202 # mutable native string (`ns`)
203 private var buf_size: Int is noinit
204
205 redef fun substrings: Iterator[String] do return new RopeBufSubstringIterator(self)
206
207 # Builds an empty `RopeBuffer`
208 init do
209 str = ""
210 ns = new NativeString(maxlen)
211 buf_size = maxlen
212 dumped = 0
213 end
214
215 # Builds a new `RopeBuffer` with `str` in it.
216 init from(str: String) do
217 self.str = str
218 ns = new NativeString(maxlen)
219 buf_size = maxlen
220 length = str.length
221 dumped = 0
222 end
223
224 # Resets the informations of the `Buffer`
225 #
226 # This is called when doing in-place modifications
227 # on a previously to_s'd `RopeBuffer`
228 private fun reset do
229 var nns = new NativeString(buf_size)
230 var blen = rpos - dumped
231 ns.copy_to(nns, blen, dumped, 0)
232 dumped = 0
233 rpos = blen
234 written = false
235 end
236
237 redef fun empty do return new RopeBuffer
238
239 redef fun clear do
240 str = ""
241 length = 0
242 rpos = 0
243 dumped = 0
244 end
245
246 redef fun substring(from, count) do
247 var strlen = str.length
248
249 if from < 0 then
250 count += from
251 if count < 0 then count = 0
252 from = 0
253 end
254
255 if count > length then count = length - from
256
257 if count == 0 then return empty
258
259 if from < strlen then
260 var subpos = strlen - from
261 if count <= subpos then
262 return new RopeBuffer.from(str.substring(from, count))
263 else
264 var l = str.substring_from(from)
265 var rem = count - subpos
266 var nns = new NativeString(rem)
267 ns.copy_to(nns, rem, dumped, 0)
268 return new RopeBuffer.from(l + nns.to_s_with_length(rem))
269 end
270 else
271 var nns = new NativeString(count)
272 ns.copy_to(nns, count, dumped, 0)
273 return new RopeBuffer.from(nns.to_s_with_length(count))
274 end
275 end
276
277 redef fun append(s) do
278 var slen = s.length
279 length += slen
280 var rp = rpos
281 if s isa Rope or slen > maxlen then
282 if rp > 0 and dumped != rp then
283 str += new FlatString.with_infos(ns, rp - dumped, dumped, rp - 1)
284 dumped = rp
285 end
286 str = str + s
287 return
288 end
289 var remsp = buf_size - rp
290 var sits: NativeString
291 var begin: Int
292 if s isa FlatString then
293 begin = s.index_from
294 sits = s.items
295 else if s isa FlatBuffer then
296 begin = 0
297 sits = s.items
298 else
299 if slen <= remsp then
300 for i in s.chars do
301 ns[rpos] = i
302 rpos += 1
303 end
304 else
305 var spos = 0
306 for i in [0..remsp[ do
307 ns[rpos] = s[spos]
308 rpos += 1
309 spos += 1
310 end
311 dump_buffer
312 while spos < slen do
313 ns[rpos] = s[spos]
314 spos += 1
315 rpos += 1
316 end
317 end
318 return
319 end
320 if slen <= remsp then
321 if remsp <= 0 then
322 dump_buffer
323 rpos = 0
324 else
325 sits.copy_to(ns, slen, begin, rp)
326 rpos += slen
327 end
328 else
329 sits.copy_to(ns, remsp, begin, rp)
330 rpos = buf_size
331 dump_buffer
332 var nlen = slen - remsp
333 sits.copy_to(ns, nlen, begin + remsp, 0)
334 rpos = nlen
335 end
336 end
337
338 redef fun add(c) do
339 var rp = rpos
340 if rp >= buf_size then
341 dump_buffer
342 rp = 0
343 end
344 ns[rp] = c
345 rp += 1
346 length += 1
347 rpos = rp
348 end
349
350 # Converts the Buffer to a FlatString, appends it to
351 # the final String and re-allocates a new larger Buffer.
352 private fun dump_buffer do
353 written = false
354 var nstr = new FlatString.with_infos(ns, rpos - dumped, dumped, rpos - 1)
355 str += nstr
356 var bs = buf_size
357 bs = bs * 2
358 ns = new NativeString(bs)
359 buf_size = bs
360 dumped = 0
361 end
362
363 redef fun output do
364 str.output
365 new FlatString.with_infos(ns, rpos - dumped, dumped, rpos - 1).output
366 end
367
368 # Enlarge is useless here since the `Buffer`
369 # part is automatically dumped when necessary.
370 #
371 # Also, since the buffer can not be overused by a
372 # single string, there is no need for manual
373 # resizing.
374 #
375 # "You have no power here !"
376 redef fun enlarge(i) do end
377
378 redef fun to_s do
379 written = true
380 var nnslen = rpos - dumped
381 if nnslen == 0 then return str
382 return str + new FlatString.with_infos(ns, rpos - dumped, dumped, rpos - 1)
383 end
384
385 redef fun reverse do
386 str = str.reversed
387 var nns = new NativeString(buf_size)
388 var j = rpos
389 var mits = ns
390 for i in [0 .. rpos[ do
391 nns[i] = mits[j]
392 j -= 1
393 end
394 ns = nns
395 end
396
397 redef fun upper do
398 if written then reset
399 str = str.to_upper
400 var mits = ns
401 for i in [0 .. rpos[ do
402 mits[i] = mits[i].to_upper
403 end
404 end
405
406 redef fun lower do
407 if written then reset
408 str = str.to_lower
409 var mits = ns
410 for i in [0 .. rpos[ do
411 mits[i] = mits[i].to_lower
412 end
413 end
414 end
415
416 redef class FlatString
417
418 redef fun insert_at(s, pos) do
419 var l = substring(0, pos)
420 var r = substring_from(pos)
421 return l + s + r
422 end
423
424 redef fun +(o) do
425 var s = o.to_s
426 var slen = s.length
427 var mlen = length
428 if slen == 0 then return self
429 if mlen == 0 then return s
430 var nlen = slen + mlen
431 if s isa FlatString then
432 if nlen > maxlen then return new Concat(self, s)
433 var mits = items
434 var sifrom = s.index_from
435 var mifrom = index_from
436 var sits = s.items
437 var ns = new NativeString(nlen + 1)
438 mits.copy_to(ns, mlen, mifrom, 0)
439 sits.copy_to(ns, slen, sifrom, mlen)
440 return ns.to_s_with_length(nlen)
441 else if s isa Concat then
442 var sl = s.left
443 var sllen = sl.length
444 if sllen + mlen > maxlen then return new Concat(self, s)
445 return new Concat(self + sl, s.right)
446 else
447 abort
448 end
449 end
450 end
451
452 # A simple linked list for use with iterators
453 private class RopeIterPiece
454 # The encapsulated node of the `Rope`
455 var node: String
456 # Was its left child (if any) visited ?
457 var ldone: Bool
458 # Was its right child (if any) visited ?
459 var rdone: Bool
460 # The previous node in the list.
461 var prev: nullable RopeIterPiece
462 end
463
464 # A reverse iterator capable of working with `Rope` objects
465 private class RopeReviter
466 super IndexedIterator[Char]
467
468 # Current NativeString
469 var ns: String
470 # Current position in NativeString
471 var pns: Int
472 # Position in the Rope (0-indexed)
473 var pos: Int
474 # Iterator on the substrings, does the Postfix part of
475 # the Rope traversal.
476 var subs: IndexedIterator[String]
477
478 init(root: RopeString) is old_style_init do
479 pos = root.length - 1
480 subs = new ReverseRopeSubstrings(root)
481 ns = subs.item
482 pns = ns.length - 1
483 end
484
485 init from(root: RopeString, pos: Int) do
486 self.pos = pos
487 subs = new ReverseRopeSubstrings.from(root, pos)
488 ns = subs.item
489 pns = pos - subs.index
490 end
491
492 redef fun index do return pos
493
494 redef fun is_ok do return pos >= 0
495
496 redef fun item do return ns[pns]
497
498 redef fun next do
499 pns -= 1
500 pos -= 1
501 if pns >= 0 then return
502 if not subs.is_ok then return
503 subs.next
504 if not subs.is_ok then return
505 ns = subs.item
506 pns = ns.length - 1
507 end
508 end
509
510 # Forward iterator on the chars of a `Rope`
511 private class RopeIter
512 super IndexedIterator[Char]
513
514 # Position in current `String`
515 var pns: Int
516 # Current `String` being iterated on
517 var str: String
518 # Substrings of the Rope
519 var subs: IndexedIterator[String]
520 # Maximum position to iterate on (e.g. Rope.length)
521 var max: Int
522 # Position (char) in the Rope (0-indexed)
523 var pos: Int
524
525 init(root: RopeString) is old_style_init do
526 subs = new RopeSubstrings(root)
527 pns = 0
528 str = subs.item
529 max = root.length - 1
530 pos = 0
531 end
532
533 init from(root: RopeString, pos: Int) do
534 subs = new RopeSubstrings.from(root, pos)
535 pns = pos - subs.index
536 self.pos = pos
537 str = subs.item
538 max = root.length - 1
539 end
540
541 redef fun item do return str[pns]
542
543 redef fun is_ok do return pos <= max
544
545 redef fun index do return pos
546
547 redef fun next do
548 pns += 1
549 pos += 1
550 if pns < subs.item.length then return
551 if not subs.is_ok then return
552 subs.next
553 if not subs.is_ok then return
554 str = subs.item
555 pns = 0
556 end
557 end
558
559 # Substrings of a Rope (i.e. Reverse postfix iterator on leaves)
560 private class ReverseRopeSubstrings
561 super IndexedIterator[String]
562
563 # Visit Stack
564 var iter: RopeIterPiece is noinit
565 # Position in `Rope`
566 var pos: Int is noinit
567
568 # Current leaf
569 var str: String is noinit
570
571 init(root: RopeString) is old_style_init do
572 var r = new RopeIterPiece(root, false, true, null)
573 pos = root.length - 1
574 var lnod: String = root
575 loop
576 if lnod isa Concat then
577 lnod = lnod.right
578 r = new RopeIterPiece(lnod, false, true, r)
579 else
580 str = lnod
581 iter = r
582 break
583 end
584 end
585 end
586
587 init from(root: RopeString, pos: Int) do
588 var r = new RopeIterPiece(root, false, true, null)
589 var rnod: String = root
590 var off = pos
591 loop
592 if rnod isa Concat then
593 if off >= rnod.left.length then
594 off -= rnod.left.length
595 rnod = rnod.right
596 r = new RopeIterPiece(rnod, false, true, r)
597 else
598 r.ldone = true
599 rnod = rnod.left
600 r = new RopeIterPiece(rnod, false, true, r)
601 end
602 else
603 str = rnod
604 r.ldone = true
605 iter = r
606 self.pos = pos - off
607 break
608 end
609 end
610 end
611
612 redef fun item do return str
613
614 redef fun index do return pos
615
616 redef fun is_ok do return pos >= 0
617
618 redef fun next do
619 if pos < 0 then return
620 var curr = iter.prev
621 var currit = curr.node
622 while curr != null do
623 currit = curr.node
624 if not currit isa Concat then
625 str = currit
626 pos -= str.length
627 iter = curr
628 return
629 end
630 if not curr.rdone then
631 curr.rdone = true
632 curr = new RopeIterPiece(currit.right, false, false, curr)
633 continue
634 end
635 if not curr.ldone then
636 curr.ldone = true
637 curr = new RopeIterPiece(currit.left, false, false, curr)
638 continue
639 end
640 curr = curr.prev
641 end
642 pos = -1
643 end
644 end
645
646 private class RopeBufSubstringIterator
647 super Iterator[String]
648
649 # Iterator on the substrings of the building string
650 var iter: Iterator[String]
651 # Makes a String out of the buffered part of the Ropebuffer
652 var nsstr: String
653 # Did we attain the buffered part ?
654 var nsstr_done = false
655
656 init(str: RopeBuffer) is old_style_init do
657 iter = str.str.substrings
658 nsstr = new FlatString.with_infos(str.ns, str.rpos - str.dumped, str.dumped, str.rpos - 1)
659 if str.length == 0 then nsstr_done = true
660 end
661
662 redef fun is_ok do return iter.is_ok or not nsstr_done
663
664 redef fun item do
665 assert is_ok
666 if iter.is_ok then return iter.item
667 return nsstr
668 end
669
670 redef fun next do
671 if iter.is_ok then
672 iter.next
673 return
674 end
675 nsstr_done = true
676 end
677 end
678
679 # Substrings of a Rope (i.e. Postfix iterator on leaves)
680 private class RopeSubstrings
681 super IndexedIterator[String]
682
683 # Visit Stack
684 var iter: RopeIterPiece is noinit
685 # Position in `Rope`
686 var pos: Int is noinit
687 # Maximum position in `Rope` (i.e. length - 1)
688 var max: Int is noinit
689
690 # Current leaf
691 var str: String is noinit
692
693 init(root: RopeString) is old_style_init do
694 var r = new RopeIterPiece(root, true, false, null)
695 pos = 0
696 max = root.length - 1
697 var rnod: String = root
698 loop
699 if rnod isa Concat then
700 rnod = rnod.left
701 r = new RopeIterPiece(rnod, true, false, r)
702 else
703 str = rnod
704 r.rdone = true
705 iter = r
706 break
707 end
708 end
709 end
710
711 init from(root: RopeString, pos: Int) do
712 var r = new RopeIterPiece(root, true, false, null)
713 max = root.length - 1
714 var rnod: String = root
715 var off = pos
716 loop
717 if rnod isa Concat then
718 if off >= rnod.left.length then
719 r.rdone = true
720 off -= rnod.left.length
721 rnod = rnod.right
722 r = new RopeIterPiece(rnod, true, false, r)
723 else
724 rnod = rnod.left
725 r = new RopeIterPiece(rnod, true, false, r)
726 end
727 else
728 str = rnod
729 r.rdone = true
730 iter = r
731 self.pos = pos - off
732 break
733 end
734 end
735 end
736
737 redef fun item do return str
738
739 redef fun is_ok do return pos <= max
740
741 redef fun index do return pos
742
743 redef fun next do
744 pos += str.length
745 if pos > max then return
746 var it = iter.prev
747 var rnod = it.node
748 loop
749 if not rnod isa Concat then
750 it.ldone = true
751 it.rdone = true
752 str = rnod
753 iter = it.as(not null)
754 break
755 end
756 if not it.ldone then
757 rnod = rnod.left
758 it.ldone = true
759 it = new RopeIterPiece(rnod, false, false, it)
760 else if not it.rdone then
761 it.rdone = true
762 rnod = rnod.right
763 it = new RopeIterPiece(rnod, false, false, it)
764 else
765 it = it.prev
766 rnod = it.node
767 continue
768 end
769 end
770 end
771 end
772
773 # Implementation of a `StringCharView` for `RopeString` objects
774 private class RopeChars
775 super StringCharView
776
777 var tgt: RopeString
778
779 init(s: RopeString) is old_style_init do tgt = s
780
781 redef fun [](i) do
782 return tgt[i]
783 end
784
785 redef fun iterator_from(i) do return new RopeIter.from(tgt, i)
786
787 redef fun reverse_iterator_from(i) do return new RopeReviter.from(tgt, i)
788
789 end
790
791 # An Iterator over a RopeBuffer.
792 class RopeBufferIter
793 super IndexedIterator[Char]
794
795 # Subiterator.
796 var sit: IndexedIterator[Char]
797
798 # Native string iterated over.
799 var ns: NativeString
800
801 # Current position in `ns`.
802 var pns: Int
803
804 # Maximum position iterable.
805 var maxpos: Int
806
807 redef var index: Int
808
809 # Init the iterator from a RopeBuffer.
810 init(t: RopeBuffer) is old_style_init do
811 ns = t.ns
812 maxpos = t.rpos
813 sit = t.str.chars.iterator
814 pns = t.dumped
815 index = 0
816 end
817
818 # Init the iterator from a RopeBuffer starting from `pos`.
819 init from(t: RopeBuffer, pos: Int) do
820 ns = t.ns
821 maxpos = t.length
822 sit = t.str.chars.iterator_from(pos)
823 pns = pos - t.str.length
824 index = pos
825 end
826
827 redef fun is_ok do return index < maxpos
828
829 redef fun item do
830 if sit.is_ok then return sit.item
831 return ns[pns]
832 end
833
834 redef fun next do
835 index += 1
836 if sit.is_ok then
837 sit.next
838 else
839 pns += 1
840 end
841 end
842 end
843
844 # Reverse iterator over a RopeBuffer.
845 class RopeBufferReviter
846 super IndexedIterator[Char]
847
848 # Subiterator.
849 var sit: IndexedIterator[Char]
850
851 # Native string iterated over.
852 var ns: NativeString
853
854 # Current position in `ns`.
855 var pns: Int
856
857 redef var index: Int
858
859 # Init the iterator from a RopeBuffer.
860 init(tgt: RopeBuffer) is old_style_init do
861 sit = tgt.str.chars.reverse_iterator
862 pns = tgt.rpos - 1
863 index = tgt.length - 1
864 ns = tgt.ns
865 end
866
867 # Init the iterator from a RopeBuffer starting from `pos`.
868 init from(tgt: RopeBuffer, pos: Int) do
869 sit = tgt.str.chars.reverse_iterator_from(pos - tgt.rpos - tgt.dumped)
870 pns = pos - tgt.str.length
871 index = pos
872 ns = tgt.ns
873 end
874
875 redef fun is_ok do return index > 0
876
877 redef fun item do
878 if pns >= 0 then return ns[pns]
879 return sit.item
880 end
881
882 redef fun next do
883 index -= 1
884 if pns >= 0 then
885 pns -= 1
886 else
887 sit.next
888 end
889 end
890 end
891
892 # View on the chars of a `RopeBuffer`
893 class RopeBufferChars
894 super BufferCharView
895
896 redef type SELFTYPE: RopeBuffer
897
898 redef fun [](i) do
899 if i < target.str.length then
900 return target.str[i]
901 else
902 return target.ns[i - target.str.length]
903 end
904 end
905
906 redef fun []=(i,c) do
907 if i == target.length then target.add c
908 if i < target.str.length then
909 var s = target.str
910 var l = s.substring(0, i)
911 var r = s.substring_from(i + 1)
912 target.str = l + c.to_s + r
913 else
914 target.ns[i - target.str.length] = c
915 end
916 end
917
918 redef fun add(c) do target.add c
919
920 redef fun push(c) do target.add c
921
922 redef fun iterator_from(i) do return new RopeBufferIter.from(target, i)
923
924 redef fun reverse_iterator_from(i) do return new RopeBufferReviter.from(target, i)
925 end