df984333b4f0c0d4237aae738a2cd476b8093be9
[nit.git] / lib / standard / ropes.nit
1 # This file is part of NIT (http://www.nitlanguage.org).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Tree-based representation of a String.
16 #
17 # Ropes are a data structure introduced in a 1995 paper from
18 # Hans J. Boehm, Russ Atkinson and Michael Plass.
19 # See : `Ropes : an Alternative to Strings`, `Software - Practice and Experience,
20 # Vol. 25(12), 1315-1330 (December 1995)`.
21 #
22 # The implementation developed here provides an automatic change
23 # of data structure depending on the length of the leaves.
24 #
25 # The length from which a `Rope` is built from a `flat` string
26 # if defined at top-level (see `maxlen`) and can be redefined by clients
27 # depending on their needs (e.g. if you need to bench the speed of
28 # the creation of concat nodes, lower the size of maxlen).
29 #
30 # A Rope as defined in the original paper is a Tree made of concatenation
31 # nodes and containing `Flat` (that is Array-based) strings as Leaves.
32 #
33 # Example :
34 #
35 # ` Concat `
36 # ` / \ `
37 # ` Concat Concat `
38 # ` / \ / \ `
39 # `"My" " Name" " is" " Simon." `
40 #
41 # Note that the above example is not representative of the actual implementation
42 # of `Ropes`, since short leaves are merged to keep the rope at an acceptable
43 # height (hence, this rope here might actually be a `FlatString` !).
44 module ropes
45
46 intrude import string
47
48 # Maxlen is the maximum length of a Leaf node
49 #
50 # When concatenating two leaves, if `new_length` > `maxlen`,
51 # A `Concat` node is created instead
52 #
53 # Its purpose is to limit the depth of the `Rope` (this
54 # improves performance when accessing/iterating).
55 fun maxlen: Int do return 64
56
57 # String using a tree-based representation with leaves as `FlatStrings`
58 private abstract class Rope
59 super Text
60 end
61
62 private abstract class RopeString
63 super Rope
64 super String
65
66 redef fun chars is cached do return new RopeChars(self)
67 end
68
69 # Node that represents a concatenation between two `String`
70 private class Concat
71 super RopeString
72
73 redef var length: Int
74
75 redef fun substrings do return new RopeSubstrings(self)
76
77 redef fun empty do return ""
78
79 redef fun to_cstring is cached do
80 var len = length
81 var ns = new NativeString(len + 1)
82 ns[len] = '\0'
83 var off = 0
84 for i in substrings do
85 var ilen = i.length
86 i.as(FlatString).items.copy_to(ns, ilen, i.as(FlatString).index_from, off)
87 off += ilen
88 end
89 return ns
90 end
91
92 # Left child of the node
93 var left: String
94 # Right child of the node
95 var right: String
96
97 init(l: String, r: String) is old_style_init do
98 left = l
99 right = r
100 length = l.length + r.length
101 end
102
103 redef fun output do
104 left.output
105 right.output
106 end
107
108 redef fun iterator do return new RopeIter(self)
109
110 redef fun *(i) do
111 var x: String = self
112 for j in [1 .. i[ do x += self
113 return x
114 end
115
116 redef fun [](i) do
117 var llen = left.length
118 if i >= llen then return right[i - llen]
119 return left[i]
120 end
121
122 redef fun substring(from, len) do
123 var llen = left.length
124 if from < llen then
125 if from + len < llen then return left.substring(from,len)
126 var lsublen = llen - from
127 return left.substring_from(from) + right.substring(0, len - lsublen)
128 else
129 return right.substring(from - llen, len)
130 end
131 end
132
133 redef fun reversed do return new Concat(right.reversed, left.reversed)
134
135 redef fun insert_at(s, pos) do
136 if pos > left.length then
137 return left + right.insert_at(s, pos - left.length)
138 end
139 return left.insert_at(s, pos) + right
140 end
141
142 redef fun to_upper do return new Concat(left.to_upper, right.to_upper)
143
144 redef fun to_lower do return new Concat(left.to_lower, right.to_lower)
145
146 redef fun +(o) do
147 var s = o.to_s
148 var slen = s.length
149 if s isa Concat then
150 return new Concat(self, s)
151 else
152 var r = right
153 var rlen = r.length
154 if rlen + slen > maxlen then return new Concat(left, new Concat(r, s))
155 return new Concat(left, r + s)
156 end
157 end
158 end
159
160 # Mutable `Rope`, optimized for concatenation operations
161 #
162 # A `RopeBuffer` is an efficient way of building a `String` when
163 # concatenating small strings.
164 #
165 # It does concatenations in an optimized way by having a
166 # mutable part and an immutable part built by efficiently
167 # concatenating strings in chain.
168 #
169 # Every concatenation operation is done by copying a string to
170 # the mutable part and flushing it when full.
171 #
172 # However, when a long string is appended to the `Buffer`,
173 # the concatenation is done at it would be in a `Rope`.
174 class RopeBuffer
175 super Rope
176 super Buffer
177
178 redef fun chars: Sequence[Char] is cached do return new RopeBufferChars(self)
179
180 # The final string being built on the fly
181 private var str: String is noinit
182
183 # Current concatenation buffer
184 private var ns: NativeString is noinit
185
186 # Next available (e.g. unset) character in the `Buffer`
187 private var rpos = 0
188
189 # Keeps track of the buffer's currently dumped part
190 #
191 # This might happen if for instance, a String was being
192 # built by concatenating small parts of string and suddenly
193 # a long string (length > maxlen) is appended.
194 private var dumped: Int is noinit
195
196 # Length of the complete rope
197 redef var length = 0
198
199 # Length of the mutable part
200 #
201 # Is also used as base to compute the size of the next
202 # mutable native string (`ns`)
203 private var buf_size: Int is noinit
204
205 redef fun substrings: Iterator[String] do return new RopeBufSubstringIterator(self)
206
207 # Builds an empty `RopeBuffer`
208 init do
209 str = ""
210 ns = new NativeString(maxlen)
211 buf_size = maxlen
212 dumped = 0
213 end
214
215 # Builds a new `RopeBuffer` with `str` in it.
216 init from(str: String) do
217 self.str = str
218 ns = new NativeString(maxlen)
219 buf_size = maxlen
220 length = str.length
221 dumped = 0
222 end
223
224 # Resets the informations of the `Buffer`
225 #
226 # This is called when doing in-place modifications
227 # on a previously to_s'd `RopeBuffer`
228 private fun reset do
229 var nns = new NativeString(buf_size)
230 var blen = rpos - dumped
231 ns.copy_to(nns, blen, dumped, 0)
232 dumped = 0
233 rpos = blen
234 written = false
235 end
236
237 redef fun empty do return new RopeBuffer
238
239 redef fun clear do
240 str = ""
241 length = 0
242 rpos = 0
243 dumped = 0
244 end
245
246 redef fun substring(from, count) do
247 var strlen = str.length
248
249 if from < 0 then
250 count += from
251 if count < 0 then count = 0
252 from = 0
253 end
254
255 if count > length then count = length - from
256
257 if count == 0 then return empty
258
259 if from < strlen then
260 var subpos = strlen - from
261 if count <= subpos then
262 return new RopeBuffer.from(str.substring(from, count))
263 else
264 var l = str.substring_from(from)
265 var rem = count - subpos
266 var nns = new NativeString(rem)
267 ns.copy_to(nns, rem, dumped, 0)
268 return new RopeBuffer.from(l + nns.to_s_with_length(rem))
269 end
270 else
271 var nns = new NativeString(count)
272 ns.copy_to(nns, count, dumped, 0)
273 return new RopeBuffer.from(nns.to_s_with_length(count))
274 end
275 end
276
277 redef fun append(s) do
278 var slen = s.length
279 length += slen
280 var rp = rpos
281 if s isa Rope or slen > maxlen then
282 if rp > 0 and dumped != rp then
283 str += new FlatString.with_infos(ns, rp - dumped, dumped, rp - 1)
284 dumped = rp
285 end
286 str = str + s
287 return
288 end
289 var remsp = buf_size - rp
290 var sits: NativeString
291 var begin: Int
292 if s isa FlatString then
293 begin = s.index_from
294 sits = s.items
295 else if s isa FlatBuffer then
296 begin = 0
297 sits = s.items
298 else
299 if slen <= remsp then
300 for i in s.chars do
301 ns[rpos] = i
302 rpos += 1
303 end
304 else
305 var spos = 0
306 for i in [0..remsp[ do
307 ns[rpos] = s[spos]
308 rpos += 1
309 spos += 1
310 end
311 dump_buffer
312 while spos < slen do
313 ns[rpos] = s[spos]
314 spos += 1
315 rpos += 1
316 end
317 end
318 return
319 end
320 if slen <= remsp then
321 if remsp <= 0 then
322 dump_buffer
323 rpos = 0
324 else
325 sits.copy_to(ns, slen, begin, rp)
326 rpos += slen
327 end
328 else
329 sits.copy_to(ns, remsp, begin, rp)
330 rpos = buf_size
331 dump_buffer
332 var nlen = slen - remsp
333 sits.copy_to(ns, nlen, begin + remsp, 0)
334 rpos = nlen
335 end
336 end
337
338 redef fun add(c) do
339 var rp = rpos
340 if rp >= buf_size then
341 dump_buffer
342 rp = 0
343 end
344 ns[rp] = c
345 rp += 1
346 length += 1
347 rpos = rp
348 end
349
350 # Converts the Buffer to a FlatString, appends it to
351 # the final String and re-allocates a new larger Buffer.
352 private fun dump_buffer do
353 written = false
354 var nstr = new FlatString.with_infos(ns, rpos - dumped, dumped, rpos - 1)
355 str += nstr
356 var bs = buf_size
357 bs = bs * 2
358 ns = new NativeString(bs)
359 buf_size = bs
360 dumped = 0
361 end
362
363 redef fun output do
364 str.output
365 new FlatString.with_infos(ns, rpos - dumped, dumped, rpos - 1).output
366 end
367
368 # Enlarge is useless here since the `Buffer`
369 # part is automatically dumped when necessary.
370 #
371 # Also, since the buffer can not be overused by a
372 # single string, there is no need for manual
373 # resizing.
374 #
375 # "You have no power here !"
376 redef fun enlarge(i) do end
377
378 redef fun to_s do
379 written = true
380 var nnslen = rpos - dumped
381 if nnslen == 0 then return str
382 return str + new FlatString.with_infos(ns, rpos - dumped, dumped, rpos - 1)
383 end
384
385 redef fun reverse do
386 # Flush the buffer in order to only have to reverse `str`.
387 if rpos > 0 and dumped != rpos then
388 str += new FlatString.with_infos(ns, rpos - dumped, dumped, rpos - 1)
389 dumped = rpos
390 end
391 str = str.reversed
392 end
393
394 redef fun upper do
395 if written then reset
396 str = str.to_upper
397 var mits = ns
398 for i in [0 .. rpos[ do
399 mits[i] = mits[i].to_upper
400 end
401 end
402
403 redef fun lower do
404 if written then reset
405 str = str.to_lower
406 var mits = ns
407 for i in [0 .. rpos[ do
408 mits[i] = mits[i].to_lower
409 end
410 end
411 end
412
413 redef class FlatString
414
415 redef fun insert_at(s, pos) do
416 var l = substring(0, pos)
417 var r = substring_from(pos)
418 return l + s + r
419 end
420
421 redef fun +(o) do
422 var s = o.to_s
423 var slen = s.length
424 var mlen = length
425 if slen == 0 then return self
426 if mlen == 0 then return s
427 var nlen = slen + mlen
428 if s isa FlatString then
429 if nlen > maxlen then return new Concat(self, s)
430 var mits = items
431 var sifrom = s.index_from
432 var mifrom = index_from
433 var sits = s.items
434 var ns = new NativeString(nlen + 1)
435 mits.copy_to(ns, mlen, mifrom, 0)
436 sits.copy_to(ns, slen, sifrom, mlen)
437 return ns.to_s_with_length(nlen)
438 else if s isa Concat then
439 var sl = s.left
440 var sllen = sl.length
441 if sllen + mlen > maxlen then return new Concat(self, s)
442 return new Concat(self + sl, s.right)
443 else
444 abort
445 end
446 end
447 end
448
449 # A simple linked list for use with iterators
450 private class RopeIterPiece
451 # The encapsulated node of the `Rope`
452 var node: String
453 # Was its left child (if any) visited ?
454 var ldone: Bool
455 # Was its right child (if any) visited ?
456 var rdone: Bool
457 # The previous node in the list.
458 var prev: nullable RopeIterPiece
459 end
460
461 # A reverse iterator capable of working with `Rope` objects
462 private class RopeReviter
463 super IndexedIterator[Char]
464
465 # Current NativeString
466 var ns: String
467 # Current position in NativeString
468 var pns: Int
469 # Position in the Rope (0-indexed)
470 var pos: Int
471 # Iterator on the substrings, does the Postfix part of
472 # the Rope traversal.
473 var subs: IndexedIterator[String]
474
475 init(root: RopeString) is old_style_init do
476 pos = root.length - 1
477 subs = new ReverseRopeSubstrings(root)
478 ns = subs.item
479 pns = ns.length - 1
480 end
481
482 init from(root: RopeString, pos: Int) do
483 self.pos = pos
484 subs = new ReverseRopeSubstrings.from(root, pos)
485 ns = subs.item
486 pns = pos - subs.index
487 end
488
489 redef fun index do return pos
490
491 redef fun is_ok do return pos >= 0
492
493 redef fun item do return ns[pns]
494
495 redef fun next do
496 pns -= 1
497 pos -= 1
498 if pns >= 0 then return
499 if not subs.is_ok then return
500 subs.next
501 if not subs.is_ok then return
502 ns = subs.item
503 pns = ns.length - 1
504 end
505 end
506
507 # Forward iterator on the chars of a `Rope`
508 private class RopeIter
509 super IndexedIterator[Char]
510
511 # Position in current `String`
512 var pns: Int
513 # Current `String` being iterated on
514 var str: String
515 # Substrings of the Rope
516 var subs: IndexedIterator[String]
517 # Maximum position to iterate on (e.g. Rope.length)
518 var max: Int
519 # Position (char) in the Rope (0-indexed)
520 var pos: Int
521
522 init(root: RopeString) is old_style_init do
523 subs = new RopeSubstrings(root)
524 pns = 0
525 str = subs.item
526 max = root.length - 1
527 pos = 0
528 end
529
530 init from(root: RopeString, pos: Int) do
531 subs = new RopeSubstrings.from(root, pos)
532 pns = pos - subs.index
533 self.pos = pos
534 str = subs.item
535 max = root.length - 1
536 end
537
538 redef fun item do return str[pns]
539
540 redef fun is_ok do return pos <= max
541
542 redef fun index do return pos
543
544 redef fun next do
545 pns += 1
546 pos += 1
547 if pns < subs.item.length then return
548 if not subs.is_ok then return
549 subs.next
550 if not subs.is_ok then return
551 str = subs.item
552 pns = 0
553 end
554 end
555
556 # Substrings of a Rope (i.e. Reverse postfix iterator on leaves)
557 private class ReverseRopeSubstrings
558 super IndexedIterator[String]
559
560 # Visit Stack
561 var iter: RopeIterPiece is noinit
562 # Position in `Rope`
563 var pos: Int is noinit
564
565 # Current leaf
566 var str: String is noinit
567
568 init(root: RopeString) is old_style_init do
569 var r = new RopeIterPiece(root, false, true, null)
570 pos = root.length - 1
571 var lnod: String = root
572 loop
573 if lnod isa Concat then
574 lnod = lnod.right
575 r = new RopeIterPiece(lnod, false, true, r)
576 else
577 str = lnod
578 iter = r
579 break
580 end
581 end
582 end
583
584 init from(root: RopeString, pos: Int) do
585 var r = new RopeIterPiece(root, false, true, null)
586 var rnod: String = root
587 var off = pos
588 loop
589 if rnod isa Concat then
590 if off >= rnod.left.length then
591 off -= rnod.left.length
592 rnod = rnod.right
593 r = new RopeIterPiece(rnod, false, true, r)
594 else
595 r.ldone = true
596 rnod = rnod.left
597 r = new RopeIterPiece(rnod, false, true, r)
598 end
599 else
600 str = rnod
601 r.ldone = true
602 iter = r
603 self.pos = pos - off
604 break
605 end
606 end
607 end
608
609 redef fun item do return str
610
611 redef fun index do return pos
612
613 redef fun is_ok do return pos >= 0
614
615 redef fun next do
616 if pos < 0 then return
617 var curr = iter.prev
618 var currit = curr.node
619 while curr != null do
620 currit = curr.node
621 if not currit isa Concat then
622 str = currit
623 pos -= str.length
624 iter = curr
625 return
626 end
627 if not curr.rdone then
628 curr.rdone = true
629 curr = new RopeIterPiece(currit.right, false, false, curr)
630 continue
631 end
632 if not curr.ldone then
633 curr.ldone = true
634 curr = new RopeIterPiece(currit.left, false, false, curr)
635 continue
636 end
637 curr = curr.prev
638 end
639 pos = -1
640 end
641 end
642
643 private class RopeBufSubstringIterator
644 super Iterator[String]
645
646 # Iterator on the substrings of the building string
647 var iter: Iterator[String]
648 # Makes a String out of the buffered part of the Ropebuffer
649 var nsstr: String
650 # Did we attain the buffered part ?
651 var nsstr_done = false
652
653 init(str: RopeBuffer) is old_style_init do
654 iter = str.str.substrings
655 nsstr = new FlatString.with_infos(str.ns, str.rpos - str.dumped, str.dumped, str.rpos - 1)
656 if str.length == 0 then nsstr_done = true
657 end
658
659 redef fun is_ok do return iter.is_ok or not nsstr_done
660
661 redef fun item do
662 assert is_ok
663 if iter.is_ok then return iter.item
664 return nsstr
665 end
666
667 redef fun next do
668 if iter.is_ok then
669 iter.next
670 return
671 end
672 nsstr_done = true
673 end
674 end
675
676 # Substrings of a Rope (i.e. Postfix iterator on leaves)
677 private class RopeSubstrings
678 super IndexedIterator[String]
679
680 # Visit Stack
681 var iter: RopeIterPiece is noinit
682 # Position in `Rope`
683 var pos: Int is noinit
684 # Maximum position in `Rope` (i.e. length - 1)
685 var max: Int is noinit
686
687 # Current leaf
688 var str: String is noinit
689
690 init(root: RopeString) is old_style_init do
691 var r = new RopeIterPiece(root, true, false, null)
692 pos = 0
693 max = root.length - 1
694 var rnod: String = root
695 loop
696 if rnod isa Concat then
697 rnod = rnod.left
698 r = new RopeIterPiece(rnod, true, false, r)
699 else
700 str = rnod
701 r.rdone = true
702 iter = r
703 break
704 end
705 end
706 end
707
708 init from(root: RopeString, pos: Int) do
709 var r = new RopeIterPiece(root, true, false, null)
710 max = root.length - 1
711 var rnod: String = root
712 var off = pos
713 loop
714 if rnod isa Concat then
715 if off >= rnod.left.length then
716 r.rdone = true
717 off -= rnod.left.length
718 rnod = rnod.right
719 r = new RopeIterPiece(rnod, true, false, r)
720 else
721 rnod = rnod.left
722 r = new RopeIterPiece(rnod, true, false, r)
723 end
724 else
725 str = rnod
726 r.rdone = true
727 iter = r
728 self.pos = pos - off
729 break
730 end
731 end
732 end
733
734 redef fun item do return str
735
736 redef fun is_ok do return pos <= max
737
738 redef fun index do return pos
739
740 redef fun next do
741 pos += str.length
742 if pos > max then return
743 var it = iter.prev
744 var rnod = it.node
745 loop
746 if not rnod isa Concat then
747 it.ldone = true
748 it.rdone = true
749 str = rnod
750 iter = it.as(not null)
751 break
752 end
753 if not it.ldone then
754 rnod = rnod.left
755 it.ldone = true
756 it = new RopeIterPiece(rnod, false, false, it)
757 else if not it.rdone then
758 it.rdone = true
759 rnod = rnod.right
760 it = new RopeIterPiece(rnod, false, false, it)
761 else
762 it = it.prev
763 rnod = it.node
764 continue
765 end
766 end
767 end
768 end
769
770 # Implementation of a `StringCharView` for `RopeString` objects
771 private class RopeChars
772 super StringCharView
773
774 var tgt: RopeString
775
776 init(s: RopeString) is old_style_init do tgt = s
777
778 redef fun [](i) do
779 return tgt[i]
780 end
781
782 redef fun iterator_from(i) do return new RopeIter.from(tgt, i)
783
784 redef fun reverse_iterator_from(i) do return new RopeReviter.from(tgt, i)
785
786 end
787
788 # An Iterator over a RopeBuffer.
789 class RopeBufferIter
790 super IndexedIterator[Char]
791
792 # Subiterator.
793 var sit: IndexedIterator[Char]
794
795 # Native string iterated over.
796 var ns: NativeString
797
798 # Current position in `ns`.
799 var pns: Int
800
801 # Maximum position iterable.
802 var maxpos: Int
803
804 redef var index: Int
805
806 # Init the iterator from a RopeBuffer.
807 init(t: RopeBuffer) is old_style_init do
808 ns = t.ns
809 maxpos = t.rpos
810 sit = t.str.chars.iterator
811 pns = t.dumped
812 index = 0
813 end
814
815 # Init the iterator from a RopeBuffer starting from `pos`.
816 init from(t: RopeBuffer, pos: Int) do
817 ns = t.ns
818 maxpos = t.length
819 sit = t.str.chars.iterator_from(pos)
820 pns = pos - t.str.length
821 index = pos
822 end
823
824 redef fun is_ok do return index < maxpos
825
826 redef fun item do
827 if sit.is_ok then return sit.item
828 return ns[pns]
829 end
830
831 redef fun next do
832 index += 1
833 if sit.is_ok then
834 sit.next
835 else
836 pns += 1
837 end
838 end
839 end
840
841 # Reverse iterator over a RopeBuffer.
842 class RopeBufferReviter
843 super IndexedIterator[Char]
844
845 # Subiterator.
846 var sit: IndexedIterator[Char]
847
848 # Native string iterated over.
849 var ns: NativeString
850
851 # Current position in `ns`.
852 var pns: Int
853
854 redef var index: Int
855
856 # Init the iterator from a RopeBuffer.
857 init(tgt: RopeBuffer) is old_style_init do
858 sit = tgt.str.chars.reverse_iterator
859 pns = tgt.rpos - 1
860 index = tgt.length - 1
861 ns = tgt.ns
862 end
863
864 # Init the iterator from a RopeBuffer starting from `pos`.
865 init from(tgt: RopeBuffer, pos: Int) do
866 sit = tgt.str.chars.reverse_iterator_from(pos - tgt.rpos - tgt.dumped)
867 pns = pos - tgt.str.length
868 index = pos
869 ns = tgt.ns
870 end
871
872 redef fun is_ok do return index > 0
873
874 redef fun item do
875 if pns >= 0 then return ns[pns]
876 return sit.item
877 end
878
879 redef fun next do
880 index -= 1
881 if pns >= 0 then
882 pns -= 1
883 else
884 sit.next
885 end
886 end
887 end
888
889 # View on the chars of a `RopeBuffer`
890 class RopeBufferChars
891 super BufferCharView
892
893 redef type SELFTYPE: RopeBuffer
894
895 redef fun [](i) do
896 if i < target.str.length then
897 return target.str[i]
898 else
899 return target.ns[i - target.str.length]
900 end
901 end
902
903 redef fun []=(i,c) do
904 if i == target.length then target.add c
905 if i < target.str.length then
906 var s = target.str
907 var l = s.substring(0, i)
908 var r = s.substring_from(i + 1)
909 target.str = l + c.to_s + r
910 else
911 target.ns[i - target.str.length] = c
912 end
913 end
914
915 redef fun add(c) do target.add c
916
917 redef fun push(c) do target.add c
918
919 redef fun iterator_from(i) do return new RopeBufferIter.from(target, i)
920
921 redef fun reverse_iterator_from(i) do return new RopeBufferReviter.from(target, i)
922 end