12ad67e6e8c791a906887507596bef564d6c3e84
[nit.git] / lib / standard / ropes.nit
1 # This file is part of NIT (http://www.nitlanguage.org).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Tree-based representation of a String.
16 #
17 # Ropes are a data structure introduced in a 1995 paper from
18 # Hans J. Boehm, Russ Atkinson and Michael Plass.
19 # See : `Ropes : an Alternative to Strings`, `Software - Practice and Experience,
20 # Vol. 25(12), 1315-1330 (December 1995)`.
21 #
22 # The implementation developed here provides an automatic change
23 # of data structure depending on the length of the leaves.
24 #
25 # The length from which a `Rope` is built from a `flat` string
26 # if defined at top-level (see `maxlen`) and can be redefined by clients
27 # depending on their needs (e.g. if you need to bench the speed of
28 # the creation of concat nodes, lower the size of maxlen).
29 #
30 # A Rope as defined in the original paper is a Tree made of concatenation
31 # nodes and containing `Flat` (that is Array-based) strings as Leaves.
32 #
33 # Example :
34 #
35 # ` Concat `
36 # ` / \ `
37 # ` Concat Concat `
38 # ` / \ / \ `
39 # `"My" " Name" " is" " Simon." `
40 #
41 # Note that the above example is not representative of the actual implementation
42 # of `Ropes`, since short leaves are merged to keep the rope at an acceptable
43 # height (hence, this rope here might actually be a `FlatString` !).
44 module ropes
45
46 intrude import string
47
48 # Maxlen is the maximum length of a Leaf node
49 #
50 # When concatenating two leaves, if `new_length` > `maxlen`,
51 # A `Concat` node is created instead
52 #
53 # Its purpose is to limit the depth of the `Rope` (this
54 # improves performance when accessing/iterating).
55 fun maxlen: Int do return 64
56
57 # String using a tree-based representation with leaves as `FlatStrings`
58 private abstract class Rope
59 super Text
60 end
61
62 private abstract class RopeString
63 super Rope
64 super String
65
66 redef fun chars is cached do return new RopeChars(self)
67 end
68
69 # Node that represents a concatenation between two `String`
70 private class Concat
71 super RopeString
72
73 redef var length: Int
74
75 redef fun substrings do return new RopeSubstrings(self)
76
77 redef fun empty do return ""
78
79 redef fun to_cstring is cached do
80 var len = length
81 var ns = new NativeString(len + 1)
82 ns[len] = '\0'
83 var off = 0
84 for i in substrings do
85 var ilen = i.length
86 i.as(FlatString).items.copy_to(ns, ilen, i.as(FlatString).index_from, off)
87 off += ilen
88 end
89 return ns
90 end
91
92 # Left child of the node
93 var left: String
94 # Right child of the node
95 var right: String
96
97 init(l: String, r: String) is old_style_init do
98 left = l
99 right = r
100 length = l.length + r.length
101 end
102
103 redef fun output do
104 left.output
105 right.output
106 end
107
108 redef fun iterator do return new RopeIter(self)
109
110 redef fun *(i) do
111 var x: String = self
112 for j in [1 .. i[ do x += self
113 return x
114 end
115
116 redef fun [](i) do
117 var llen = left.length
118 if i >= llen then return right[i - llen]
119 return left[i]
120 end
121
122 redef fun substring(from, len) do
123 var llen = left.length
124 if from < llen then
125 if from + len < llen then return left.substring(from,len)
126 var lsublen = llen - from
127 return left.substring_from(from) + right.substring(0, len - lsublen)
128 else
129 return right.substring(from - llen, len)
130 end
131 end
132
133 redef fun reversed do return new Concat(right.reversed, left.reversed)
134
135 redef fun insert_at(s, pos) do
136 if pos > left.length then
137 return left + right.insert_at(s, pos - left.length)
138 end
139 return left.insert_at(s, pos) + right
140 end
141
142 redef fun to_upper do return new Concat(left.to_upper, right.to_upper)
143
144 redef fun to_lower do return new Concat(left.to_lower, right.to_lower)
145
146 redef fun +(o) do
147 var s = o.to_s
148 var slen = s.length
149 if s isa Concat then
150 return new Concat(self, s)
151 else
152 var r = right
153 var rlen = r.length
154 if rlen + slen > maxlen then return new Concat(left, new Concat(r, s))
155 return new Concat(left, r + s)
156 end
157 end
158 end
159
160 # Mutable `Rope`, optimized for concatenation operations
161 #
162 # A `RopeBuffer` is an efficient way of building a `String` when
163 # concatenating small strings.
164 #
165 # It does concatenations in an optimized way by having a
166 # mutable part and an immutable part built by efficiently
167 # concatenating strings in chain.
168 #
169 # Every concatenation operation is done by copying a string to
170 # the mutable part and flushing it when full.
171 #
172 # However, when a long string is appended to the `Buffer`,
173 # the concatenation is done at it would be in a `Rope`.
174 class RopeBuffer
175 super Rope
176 super Buffer
177
178 redef fun chars: Sequence[Char] is cached do return new RopeBufferChars(self)
179
180 # The final string being built on the fly
181 private var str: String is noinit
182
183 # Current concatenation buffer
184 private var ns: NativeString is noinit
185
186 # Next available (e.g. unset) character in the `Buffer`
187 private var rpos = 0
188
189 # Keeps track of the buffer's currently dumped part
190 #
191 # This might happen if for instance, a String was being
192 # built by concatenating small parts of string and suddenly
193 # a long string (length > maxlen) is appended.
194 private var dumped: Int is noinit
195
196 # Length of the complete rope
197 redef var length = 0
198
199 # Length of the mutable part
200 #
201 # Is also used as base to compute the size of the next
202 # mutable native string (`ns`)
203 private var buf_size: Int is noinit
204
205 redef fun substrings: Iterator[String] do return new RopeBufSubstringIterator(self)
206
207 # Builds an empty `RopeBuffer`
208 init do
209 str = ""
210 ns = new NativeString(maxlen)
211 buf_size = maxlen
212 dumped = 0
213 end
214
215 # Builds a new `RopeBuffer` with `str` in it.
216 init from(str: String) do
217 self.str = str
218 ns = new NativeString(maxlen)
219 buf_size = maxlen
220 length = str.length
221 dumped = 0
222 end
223
224 # Resets the informations of the `Buffer`
225 #
226 # This is called when doing in-place modifications
227 # on a previously to_s'd `RopeBuffer`
228 private fun reset do
229 var nns = new NativeString(buf_size)
230 var blen = rpos - dumped
231 ns.copy_to(nns, blen, dumped, 0)
232 dumped = 0
233 rpos = blen
234 written = false
235 end
236
237 redef fun empty do return new RopeBuffer
238
239 redef fun clear do
240 str = ""
241 length = 0
242 rpos = 0
243 end
244
245 redef fun substring(from, count) do
246 var strlen = str.length
247
248 if from < 0 then
249 count += from
250 if count < 0 then count = 0
251 from = 0
252 end
253
254 if count > length then count = length - from
255
256 if count == 0 then return empty
257
258 if from < strlen then
259 var subpos = strlen - from
260 if count <= subpos then
261 return new RopeBuffer.from(str.substring(from, count))
262 else
263 var l = str.substring_from(from)
264 var rem = count - subpos
265 var nns = new NativeString(rem)
266 ns.copy_to(nns, rem, dumped, 0)
267 return new RopeBuffer.from(l + nns.to_s_with_length(rem))
268 end
269 else
270 var nns = new NativeString(count)
271 ns.copy_to(nns, count, dumped, 0)
272 return new RopeBuffer.from(nns.to_s_with_length(count))
273 end
274 end
275
276 redef fun append(s) do
277 var slen = s.length
278 length += slen
279 var rp = rpos
280 if s isa Rope then
281 if rp > 0 and dumped != rp then
282 str += new FlatString.with_infos(ns, rp - dumped, dumped, rp - 1)
283 dumped = rp
284 end
285 str = str + s
286 return
287 end
288 if slen > maxlen then
289 if rp > 0 and dumped != rp then
290 str += new FlatString.with_infos(ns, rp - dumped, dumped, rp - 1)
291 dumped = rp
292 end
293 str = str + s
294 return
295 end
296 var remsp = buf_size - rp
297 var sits: NativeString
298 var begin: Int
299 if s isa FlatString then
300 begin = s.index_from
301 sits = s.items
302 else if s isa FlatBuffer then
303 begin = 0
304 sits = s.items
305 else
306 if slen <= remsp then
307 for i in s.chars do
308 ns[rpos] = i
309 rpos += 1
310 end
311 else
312 var spos = 0
313 for i in [0..remsp[ do
314 ns[rpos] = s[spos]
315 rpos += 1
316 spos += 1
317 end
318 dump_buffer
319 while spos < slen do
320 ns[rpos] = s[spos]
321 spos += 1
322 rpos += 1
323 end
324 end
325 return
326 end
327 if slen <= remsp then
328 sits.copy_to(ns, slen, begin, rp)
329 if rp == buf_size then
330 rpos = buf_size
331 dump_buffer
332 rpos = 0
333 else
334 rpos += slen
335 end
336 else
337 sits.copy_to(ns, remsp, begin, rp)
338 rpos = buf_size
339 dump_buffer
340 var nlen = slen - remsp
341 sits.copy_to(ns, nlen, begin + remsp, 0)
342 rpos = nlen
343 end
344 end
345
346 redef fun add(c) do
347 var rp = rpos
348 length += 1
349 ns[rp] = c
350 rp += 1
351 if rp == buf_size then
352 rpos = rp
353 dump_buffer
354 rp = 0
355 end
356 rpos = rp
357 end
358
359 # Converts the Buffer to a FlatString, appends it to
360 # the final String and re-allocates a new larger Buffer.
361 private fun dump_buffer do
362 written = false
363 var nstr = new FlatString.with_infos(ns, rpos - dumped, dumped, rpos - 1)
364 str += nstr
365 var bs = buf_size
366 bs = bs * 2
367 ns = new NativeString(bs)
368 buf_size = bs
369 dumped = 0
370 end
371
372 redef fun output do
373 str.output
374 new FlatString.with_infos(ns, rpos - dumped, dumped, rpos - 1).output
375 end
376
377 # Enlarge is useless here since the `Buffer`
378 # part is automatically dumped when necessary.
379 #
380 # Also, since the buffer can not be overused by a
381 # single string, there is no need for manual
382 # resizing.
383 #
384 # "You have no power here !"
385 redef fun enlarge(i) do end
386
387 redef fun to_s do
388 written = true
389 var nnslen = rpos - dumped
390 if nnslen == 0 then return str
391 return str + new FlatString.with_infos(ns, rpos - dumped, dumped, rpos - 1)
392 end
393
394 redef fun reverse do
395 str = str.reversed
396 var nns = new NativeString(buf_size)
397 var j = rpos
398 var mits = ns
399 for i in [0 .. rpos[ do
400 nns[i] = mits[j]
401 j -= 1
402 end
403 ns = nns
404 end
405
406 redef fun upper do
407 if written then reset
408 str = str.to_upper
409 var mits = ns
410 for i in [0 .. rpos[ do
411 mits[i] = mits[i].to_upper
412 end
413 end
414
415 redef fun lower do
416 if written then reset
417 str = str.to_lower
418 var mits = ns
419 for i in [0 .. rpos[ do
420 mits[i] = mits[i].to_lower
421 end
422 end
423 end
424
425 redef class FlatString
426
427 redef fun insert_at(s, pos) do
428 var l = substring(0, pos)
429 var r = substring_from(pos)
430 return l + s + r
431 end
432
433 redef fun +(o) do
434 var s = o.to_s
435 var slen = s.length
436 var mlen = length
437 if slen == 0 then return self
438 if mlen == 0 then return s
439 var nlen = slen + mlen
440 if s isa FlatString then
441 if nlen > maxlen then return new Concat(self, s)
442 var mits = items
443 var sifrom = s.index_from
444 var mifrom = index_from
445 var sits = s.items
446 var ns = new NativeString(nlen + 1)
447 mits.copy_to(ns, mlen, mifrom, 0)
448 sits.copy_to(ns, slen, sifrom, mlen)
449 return ns.to_s_with_length(nlen)
450 else if s isa Concat then
451 var sl = s.left
452 var sllen = sl.length
453 if sllen + mlen > maxlen then return new Concat(self, s)
454 return new Concat(self + sl, s.right)
455 else
456 abort
457 end
458 end
459 end
460
461 # A simple linked list for use with iterators
462 private class RopeIterPiece
463 # The encapsulated node of the `Rope`
464 var node: String
465 # Was its left child (if any) visited ?
466 var ldone: Bool
467 # Was its right child (if any) visited ?
468 var rdone: Bool
469 # The previous node in the list.
470 var prev: nullable RopeIterPiece
471 end
472
473 # A reverse iterator capable of working with `Rope` objects
474 private class RopeReviter
475 super IndexedIterator[Char]
476
477 # Current NativeString
478 var ns: String
479 # Current position in NativeString
480 var pns: Int
481 # Position in the Rope (0-indexed)
482 var pos: Int
483 # Iterator on the substrings, does the Postfix part of
484 # the Rope traversal.
485 var subs: IndexedIterator[String]
486
487 init(root: RopeString) is old_style_init do
488 pos = root.length - 1
489 subs = new ReverseRopeSubstrings(root)
490 ns = subs.item
491 pns = ns.length - 1
492 end
493
494 init from(root: RopeString, pos: Int) do
495 self.pos = pos
496 subs = new ReverseRopeSubstrings.from(root, pos)
497 ns = subs.item
498 pns = pos - subs.index
499 end
500
501 redef fun index do return pos
502
503 redef fun is_ok do return pos >= 0
504
505 redef fun item do return ns[pns]
506
507 redef fun next do
508 pns -= 1
509 pos -= 1
510 if pns >= 0 then return
511 if not subs.is_ok then return
512 subs.next
513 if not subs.is_ok then return
514 ns = subs.item
515 pns = ns.length - 1
516 end
517 end
518
519 # Forward iterator on the chars of a `Rope`
520 private class RopeIter
521 super IndexedIterator[Char]
522
523 # Position in current `String`
524 var pns: Int
525 # Current `String` being iterated on
526 var str: String
527 # Substrings of the Rope
528 var subs: IndexedIterator[String]
529 # Maximum position to iterate on (e.g. Rope.length)
530 var max: Int
531 # Position (char) in the Rope (0-indexed)
532 var pos: Int
533
534 init(root: RopeString) is old_style_init do
535 subs = new RopeSubstrings(root)
536 pns = 0
537 str = subs.item
538 max = root.length - 1
539 pos = 0
540 end
541
542 init from(root: RopeString, pos: Int) do
543 subs = new RopeSubstrings.from(root, pos)
544 pns = pos - subs.index
545 self.pos = pos
546 str = subs.item
547 max = root.length - 1
548 end
549
550 redef fun item do return str[pns]
551
552 redef fun is_ok do return pos <= max
553
554 redef fun index do return pos
555
556 redef fun next do
557 pns += 1
558 pos += 1
559 if pns < subs.item.length then return
560 if not subs.is_ok then return
561 subs.next
562 if not subs.is_ok then return
563 str = subs.item
564 pns = 0
565 end
566 end
567
568 # Substrings of a Rope (i.e. Reverse postfix iterator on leaves)
569 private class ReverseRopeSubstrings
570 super IndexedIterator[String]
571
572 # Visit Stack
573 var iter: RopeIterPiece is noinit
574 # Position in `Rope`
575 var pos: Int is noinit
576
577 # Current leaf
578 var str: String is noinit
579
580 init(root: RopeString) is old_style_init do
581 var r = new RopeIterPiece(root, false, true, null)
582 pos = root.length - 1
583 var lnod: String = root
584 loop
585 if lnod isa Concat then
586 lnod = lnod.right
587 r = new RopeIterPiece(lnod, false, true, r)
588 else
589 str = lnod
590 iter = r
591 break
592 end
593 end
594 end
595
596 init from(root: RopeString, pos: Int) do
597 var r = new RopeIterPiece(root, false, true, null)
598 var rnod: String = root
599 var off = pos
600 loop
601 if rnod isa Concat then
602 if off >= rnod.left.length then
603 off -= rnod.left.length
604 rnod = rnod.right
605 r = new RopeIterPiece(rnod, false, true, r)
606 else
607 r.ldone = true
608 rnod = rnod.left
609 r = new RopeIterPiece(rnod, false, true, r)
610 end
611 else
612 str = rnod
613 r.ldone = true
614 iter = r
615 self.pos = pos - off
616 break
617 end
618 end
619 end
620
621 redef fun item do return str
622
623 redef fun index do return pos
624
625 redef fun is_ok do return pos >= 0
626
627 redef fun next do
628 if pos < 0 then return
629 var curr = iter.prev
630 var currit = curr.node
631 while curr != null do
632 currit = curr.node
633 if not currit isa Concat then
634 str = currit
635 pos -= str.length
636 iter = curr
637 return
638 end
639 if not curr.rdone then
640 curr.rdone = true
641 curr = new RopeIterPiece(currit.right, false, false, curr)
642 continue
643 end
644 if not curr.ldone then
645 curr.ldone = true
646 curr = new RopeIterPiece(currit.left, false, false, curr)
647 continue
648 end
649 curr = curr.prev
650 end
651 pos = -1
652 end
653 end
654
655 private class RopeBufSubstringIterator
656 super Iterator[String]
657
658 # Iterator on the substrings of the building string
659 var iter: Iterator[String]
660 # Makes a String out of the buffered part of the Ropebuffer
661 var nsstr: String
662 # Did we attain the buffered part ?
663 var nsstr_done = false
664
665 init(str: RopeBuffer) is old_style_init do
666 iter = str.str.substrings
667 nsstr = new FlatString.with_infos(str.ns, str.rpos - str.dumped, str.dumped, str.rpos - 1)
668 if str.length == 0 then nsstr_done = true
669 end
670
671 redef fun is_ok do return iter.is_ok or not nsstr_done
672
673 redef fun item do
674 assert is_ok
675 if iter.is_ok then return iter.item
676 return nsstr
677 end
678
679 redef fun next do
680 if iter.is_ok then
681 iter.next
682 return
683 end
684 nsstr_done = true
685 end
686 end
687
688 # Substrings of a Rope (i.e. Postfix iterator on leaves)
689 private class RopeSubstrings
690 super IndexedIterator[String]
691
692 # Visit Stack
693 var iter: RopeIterPiece is noinit
694 # Position in `Rope`
695 var pos: Int is noinit
696 # Maximum position in `Rope` (i.e. length - 1)
697 var max: Int is noinit
698
699 # Current leaf
700 var str: String is noinit
701
702 init(root: RopeString) is old_style_init do
703 var r = new RopeIterPiece(root, true, false, null)
704 pos = 0
705 max = root.length - 1
706 var rnod: String = root
707 loop
708 if rnod isa Concat then
709 rnod = rnod.left
710 r = new RopeIterPiece(rnod, true, false, r)
711 else
712 str = rnod
713 r.rdone = true
714 iter = r
715 break
716 end
717 end
718 end
719
720 init from(root: RopeString, pos: Int) do
721 var r = new RopeIterPiece(root, true, false, null)
722 max = root.length - 1
723 var rnod: String = root
724 var off = pos
725 loop
726 if rnod isa Concat then
727 if off >= rnod.left.length then
728 r.rdone = true
729 off -= rnod.left.length
730 rnod = rnod.right
731 r = new RopeIterPiece(rnod, true, false, r)
732 else
733 rnod = rnod.left
734 r = new RopeIterPiece(rnod, true, false, r)
735 end
736 else
737 str = rnod
738 r.rdone = true
739 iter = r
740 self.pos = pos - off
741 break
742 end
743 end
744 end
745
746 redef fun item do return str
747
748 redef fun is_ok do return pos <= max
749
750 redef fun index do return pos
751
752 redef fun next do
753 pos += str.length
754 if pos > max then return
755 var it = iter.prev
756 var rnod = it.node
757 loop
758 if not rnod isa Concat then
759 it.ldone = true
760 it.rdone = true
761 str = rnod
762 iter = it.as(not null)
763 break
764 end
765 if not it.ldone then
766 rnod = rnod.left
767 it.ldone = true
768 it = new RopeIterPiece(rnod, false, false, it)
769 else if not it.rdone then
770 it.rdone = true
771 rnod = rnod.right
772 it = new RopeIterPiece(rnod, false, false, it)
773 else
774 it = it.prev
775 rnod = it.node
776 continue
777 end
778 end
779 end
780 end
781
782 # Implementation of a `StringCharView` for `RopeString` objects
783 private class RopeChars
784 super StringCharView
785
786 var tgt: RopeString
787
788 init(s: RopeString) is old_style_init do tgt = s
789
790 redef fun [](i) do
791 return tgt[i]
792 end
793
794 redef fun iterator_from(i) do return new RopeIter.from(tgt, i)
795
796 redef fun reverse_iterator_from(i) do return new RopeReviter.from(tgt, i)
797
798 end
799
800 # An Iterator over a RopeBuffer.
801 class RopeBufferIter
802 super IndexedIterator[Char]
803
804 # Subiterator.
805 var sit: IndexedIterator[Char]
806
807 # Native string iterated over.
808 var ns: NativeString
809
810 # Current position in `ns`.
811 var pns: Int
812
813 # Maximum position iterable.
814 var maxpos: Int
815
816 redef var index: Int
817
818 # Init the iterator from a RopeBuffer.
819 init(t: RopeBuffer) is old_style_init do
820 ns = t.ns
821 maxpos = t.rpos
822 sit = t.str.chars.iterator
823 pns = t.dumped
824 index = 0
825 end
826
827 # Init the iterator from a RopeBuffer starting from `pos`.
828 init from(t: RopeBuffer, pos: Int) do
829 ns = t.ns
830 maxpos = t.length
831 sit = t.str.chars.iterator_from(pos)
832 pns = pos - t.str.length
833 index = pos
834 end
835
836 redef fun is_ok do return index < maxpos
837
838 redef fun item do
839 if sit.is_ok then return sit.item
840 return ns[pns]
841 end
842
843 redef fun next do
844 index += 1
845 if sit.is_ok then
846 sit.next
847 else
848 pns += 1
849 end
850 end
851 end
852
853 # Reverse iterator over a RopeBuffer.
854 class RopeBufferReviter
855 super IndexedIterator[Char]
856
857 # Subiterator.
858 var sit: IndexedIterator[Char]
859
860 # Native string iterated over.
861 var ns: NativeString
862
863 # Current position in `ns`.
864 var pns: Int
865
866 redef var index: Int
867
868 # Init the iterator from a RopeBuffer.
869 init(tgt: RopeBuffer) is old_style_init do
870 sit = tgt.str.chars.reverse_iterator
871 pns = tgt.rpos - 1
872 index = tgt.length - 1
873 ns = tgt.ns
874 end
875
876 # Init the iterator from a RopeBuffer starting from `pos`.
877 init from(tgt: RopeBuffer, pos: Int) do
878 sit = tgt.str.chars.reverse_iterator_from(pos - tgt.rpos - tgt.dumped)
879 pns = pos - tgt.str.length
880 index = pos
881 ns = tgt.ns
882 end
883
884 redef fun is_ok do return index > 0
885
886 redef fun item do
887 if pns >= 0 then return ns[pns]
888 return sit.item
889 end
890
891 redef fun next do
892 index -= 1
893 if pns >= 0 then
894 pns -= 1
895 else
896 sit.next
897 end
898 end
899 end
900
901 # View on the chars of a `RopeBuffer`
902 class RopeBufferChars
903 super BufferCharView
904
905 redef type SELFTYPE: RopeBuffer
906
907 redef fun [](i) do
908 if i < target.str.length then
909 return target.str[i]
910 else
911 return target.ns[i - target.str.length]
912 end
913 end
914
915 redef fun []=(i,c) do
916 if i == target.length then target.add c
917 if i < target.str.length then
918 var s = target.str
919 var l = s.substring(0, i)
920 var r = s.substring_from(i + 1)
921 target.str = l + c.to_s + r
922 else
923 target.ns[i - target.str.length] = c
924 end
925 end
926
927 redef fun add(c) do target.add c
928
929 redef fun push(c) do target.add c
930
931 redef fun iterator_from(i) do return new RopeBufferIter.from(target, i)
932
933 redef fun reverse_iterator_from(i) do return new RopeBufferReviter.from(target, i)
934 end