9ef2f4dc0504c6eef53e3a6abc5cbf0ff0c59546
[nit.git] / lib / standard / ropes.nit
1 # This file is part of NIT (http://www.nitlanguage.org).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Tree-based representation of a String.
16 #
17 # Ropes are a data structure introduced in a 1995 paper from
18 # Hans J. Boehm, Russ Atkinson and Michael Plass.
19 # See : `Ropes : an Alternative to Strings`, `Software - Practice and Experience,
20 # Vol. 25(12), 1315-1330 (December 1995)`.
21 #
22 # The implementation developed here provides an automatic change
23 # of data structure depending on the length of the leaves.
24 #
25 # The length from which a `Rope` is built from a `flat` string
26 # if defined at top-level (see `maxlen`) and can be redefined by clients
27 # depending on their needs (e.g. if you need to bench the speed of
28 # the creation of concat nodes, lower the size of maxlen).
29 #
30 # A Rope as defined in the original paper is a Tree made of concatenation
31 # nodes and containing `Flat` (that is Array-based) strings as Leaves.
32 #
33 # Example :
34 #
35 # ` Concat `
36 # ` / \ `
37 # ` Concat Concat `
38 # ` / \ / \ `
39 # `"My" " Name" " is" " Simon." `
40 #
41 # Note that the above example is not representative of the actual implementation
42 # of `Ropes`, since short leaves are merged to keep the rope at an acceptable
43 # height (hence, this rope here might actually be a `FlatString` !).
44 module ropes
45
46 intrude import string
47
48 # Maxlen is the maximum length of a Leaf node
49 #
50 # When concatenating two leaves, if `new_length` > `maxlen`,
51 # A `Concat` node is created instead
52 #
53 # Its purpose is to limit the depth of the `Rope` (this
54 # improves performance when accessing/iterating).
55 fun maxlen: Int do return 64
56
57 # String using a tree-based representation with leaves as `FlatStrings`
58 private abstract class Rope
59 super Text
60 end
61
62 private abstract class RopeString
63 super Rope
64 super String
65
66 redef fun chars is cached do return new RopeChars(self)
67 end
68
69 # Node that represents a concatenation between two `String`
70 private class Concat
71 super RopeString
72
73 redef var length: Int
74
75 redef fun substrings do return new RopeSubstrings(self)
76
77 redef fun empty do return ""
78
79 redef fun to_cstring is cached do
80 var len = length
81 var ns = new NativeString(len + 1)
82 ns[len] = '\0'
83 var off = 0
84 for i in substrings do
85 var ilen = i.length
86 i.as(FlatString).items.copy_to(ns, ilen, i.as(FlatString).index_from, off)
87 off += ilen
88 end
89 return ns
90 end
91
92 # Left child of the node
93 var left: String
94 # Right child of the node
95 var right: String
96
97 init(l: String, r: String) is old_style_init do
98 left = l
99 right = r
100 length = l.length + r.length
101 end
102
103 redef fun output do
104 left.output
105 right.output
106 end
107
108 redef fun iterator do return new RopeIter(self)
109
110 redef fun *(i) do
111 var x: String = self
112 for j in [1 .. i[ do x += self
113 return x
114 end
115
116 redef fun [](i) do
117 var llen = left.length
118 if i >= llen then return right[i - llen]
119 return left[i]
120 end
121
122 redef fun substring(from, len) do
123 var llen = left.length
124 if from < llen then
125 if from + len < llen then return left.substring(from,len)
126 var lsublen = llen - from
127 return left.substring_from(from) + right.substring(0, len - lsublen)
128 else
129 return right.substring(from - llen, len)
130 end
131 end
132
133 redef fun reversed do return new Concat(right.reversed, left.reversed)
134
135 redef fun insert_at(s, pos) do
136 if pos > left.length then
137 return left + right.insert_at(s, pos - left.length)
138 end
139 return left.insert_at(s, pos) + right
140 end
141
142 redef fun to_upper do return new Concat(left.to_upper, right.to_upper)
143
144 redef fun to_lower do return new Concat(left.to_lower, right.to_lower)
145
146 redef fun +(o) do
147 var s = o.to_s
148 var slen = s.length
149 if s isa Concat then
150 return new Concat(self, s)
151 else
152 var r = right
153 var rlen = r.length
154 if rlen + slen > maxlen then return new Concat(left, new Concat(r, s))
155 return new Concat(left, r + s)
156 end
157 end
158 end
159
160 # Mutable `Rope`, optimized for concatenation operations
161 #
162 # A `RopeBuffer` is an efficient way of building a `String` when
163 # concatenating small strings.
164 #
165 # It does concatenations in an optimized way by having a
166 # mutable part and an immutable part built by efficiently
167 # concatenating strings in chain.
168 #
169 # Every concatenation operation is done by copying a string to
170 # the mutable part and flushing it when full.
171 #
172 # However, when a long string is appended to the `Buffer`,
173 # the concatenation is done at it would be in a `Rope`.
174 class RopeBuffer
175 super Rope
176 super Buffer
177
178 redef fun chars: Sequence[Char] is cached do return new RopeBufferChars(self)
179
180 # The final string being built on the fly
181 private var str: String is noinit
182
183 # Current concatenation buffer
184 private var ns: NativeString is noinit
185
186 # Next available (e.g. unset) character in the `Buffer`
187 private var rpos = 0
188
189 # Keeps track of the buffer's currently dumped part
190 #
191 # This might happen if for instance, a String was being
192 # built by concatenating small parts of string and suddenly
193 # a long string (length > maxlen) is appended.
194 private var dumped: Int is noinit
195
196 # Length of the complete rope
197 redef var length = 0
198
199 # Length of the mutable part
200 #
201 # Is also used as base to compute the size of the next
202 # mutable native string (`ns`)
203 private var buf_size: Int is noinit
204
205 redef fun substrings: Iterator[String] do return new RopeBufSubstringIterator(self)
206
207 # Builds an empty `RopeBuffer`
208 init do
209 str = ""
210 ns = new NativeString(maxlen)
211 buf_size = maxlen
212 dumped = 0
213 end
214
215 # Builds a new `RopeBuffer` with `str` in it.
216 init from(str: String) do
217 self.str = str
218 ns = new NativeString(maxlen)
219 buf_size = maxlen
220 length = str.length
221 dumped = 0
222 end
223
224 # Resets the informations of the `Buffer`
225 #
226 # This is called when doing in-place modifications
227 # on a previously to_s'd `RopeBuffer`
228 private fun reset do
229 var nns = new NativeString(buf_size)
230 var blen = rpos - dumped
231 ns.copy_to(nns, blen, dumped, 0)
232 dumped = 0
233 rpos = blen
234 written = false
235 end
236
237 redef fun empty do return new RopeBuffer
238
239 redef fun clear do
240 str = ""
241 length = 0
242 rpos = 0
243 dumped = 0
244 end
245
246 redef fun substring(from, count) do
247 var strlen = str.length
248
249 if from < 0 then
250 count += from
251 if count < 0 then count = 0
252 from = 0
253 end
254
255 if count > length then count = length - from
256
257 if count == 0 then return empty
258
259 if from < strlen then
260 var subpos = strlen - from
261 if count <= subpos then
262 return new RopeBuffer.from(str.substring(from, count))
263 else
264 var l = str.substring_from(from)
265 var rem = count - subpos
266 var nns = new NativeString(rem)
267 ns.copy_to(nns, rem, dumped, 0)
268 return new RopeBuffer.from(l + nns.to_s_with_length(rem))
269 end
270 else
271 var nns = new NativeString(count)
272 ns.copy_to(nns, count, dumped, 0)
273 return new RopeBuffer.from(nns.to_s_with_length(count))
274 end
275 end
276
277 redef fun append(s) do
278 var slen = s.length
279 length += slen
280 var rp = rpos
281 if s isa Rope or slen > maxlen then
282 if rp > 0 and dumped != rp then
283 str += new FlatString.with_infos(ns, rp - dumped, dumped, rp - 1)
284 dumped = rp
285 end
286 str = str + s
287 return
288 end
289 var remsp = buf_size - rp
290 var sits: NativeString
291 var begin: Int
292 if s isa FlatString then
293 begin = s.index_from
294 sits = s.items
295 else if s isa FlatBuffer then
296 begin = 0
297 sits = s.items
298 else
299 if slen <= remsp then
300 for i in s.chars do
301 ns[rpos] = i
302 rpos += 1
303 end
304 else
305 var spos = 0
306 for i in [0..remsp[ do
307 ns[rpos] = s[spos]
308 rpos += 1
309 spos += 1
310 end
311 dump_buffer
312 while spos < slen do
313 ns[rpos] = s[spos]
314 spos += 1
315 rpos += 1
316 end
317 end
318 return
319 end
320 if slen <= remsp then
321 if remsp <= 0 then
322 dump_buffer
323 rpos = 0
324 else
325 sits.copy_to(ns, slen, begin, rp)
326 rpos += slen
327 end
328 else
329 sits.copy_to(ns, remsp, begin, rp)
330 rpos = buf_size
331 dump_buffer
332 var nlen = slen - remsp
333 sits.copy_to(ns, nlen, begin + remsp, 0)
334 rpos = nlen
335 end
336 end
337
338 redef fun add(c) do
339 var rp = rpos
340 length += 1
341 ns[rp] = c
342 rp += 1
343 if rp == buf_size then
344 rpos = rp
345 dump_buffer
346 rp = 0
347 end
348 rpos = rp
349 end
350
351 # Converts the Buffer to a FlatString, appends it to
352 # the final String and re-allocates a new larger Buffer.
353 private fun dump_buffer do
354 written = false
355 var nstr = new FlatString.with_infos(ns, rpos - dumped, dumped, rpos - 1)
356 str += nstr
357 var bs = buf_size
358 bs = bs * 2
359 ns = new NativeString(bs)
360 buf_size = bs
361 dumped = 0
362 end
363
364 redef fun output do
365 str.output
366 new FlatString.with_infos(ns, rpos - dumped, dumped, rpos - 1).output
367 end
368
369 # Enlarge is useless here since the `Buffer`
370 # part is automatically dumped when necessary.
371 #
372 # Also, since the buffer can not be overused by a
373 # single string, there is no need for manual
374 # resizing.
375 #
376 # "You have no power here !"
377 redef fun enlarge(i) do end
378
379 redef fun to_s do
380 written = true
381 var nnslen = rpos - dumped
382 if nnslen == 0 then return str
383 return str + new FlatString.with_infos(ns, rpos - dumped, dumped, rpos - 1)
384 end
385
386 redef fun reverse do
387 str = str.reversed
388 var nns = new NativeString(buf_size)
389 var j = rpos
390 var mits = ns
391 for i in [0 .. rpos[ do
392 nns[i] = mits[j]
393 j -= 1
394 end
395 ns = nns
396 end
397
398 redef fun upper do
399 if written then reset
400 str = str.to_upper
401 var mits = ns
402 for i in [0 .. rpos[ do
403 mits[i] = mits[i].to_upper
404 end
405 end
406
407 redef fun lower do
408 if written then reset
409 str = str.to_lower
410 var mits = ns
411 for i in [0 .. rpos[ do
412 mits[i] = mits[i].to_lower
413 end
414 end
415 end
416
417 redef class FlatString
418
419 redef fun insert_at(s, pos) do
420 var l = substring(0, pos)
421 var r = substring_from(pos)
422 return l + s + r
423 end
424
425 redef fun +(o) do
426 var s = o.to_s
427 var slen = s.length
428 var mlen = length
429 if slen == 0 then return self
430 if mlen == 0 then return s
431 var nlen = slen + mlen
432 if s isa FlatString then
433 if nlen > maxlen then return new Concat(self, s)
434 var mits = items
435 var sifrom = s.index_from
436 var mifrom = index_from
437 var sits = s.items
438 var ns = new NativeString(nlen + 1)
439 mits.copy_to(ns, mlen, mifrom, 0)
440 sits.copy_to(ns, slen, sifrom, mlen)
441 return ns.to_s_with_length(nlen)
442 else if s isa Concat then
443 var sl = s.left
444 var sllen = sl.length
445 if sllen + mlen > maxlen then return new Concat(self, s)
446 return new Concat(self + sl, s.right)
447 else
448 abort
449 end
450 end
451 end
452
453 # A simple linked list for use with iterators
454 private class RopeIterPiece
455 # The encapsulated node of the `Rope`
456 var node: String
457 # Was its left child (if any) visited ?
458 var ldone: Bool
459 # Was its right child (if any) visited ?
460 var rdone: Bool
461 # The previous node in the list.
462 var prev: nullable RopeIterPiece
463 end
464
465 # A reverse iterator capable of working with `Rope` objects
466 private class RopeReviter
467 super IndexedIterator[Char]
468
469 # Current NativeString
470 var ns: String
471 # Current position in NativeString
472 var pns: Int
473 # Position in the Rope (0-indexed)
474 var pos: Int
475 # Iterator on the substrings, does the Postfix part of
476 # the Rope traversal.
477 var subs: IndexedIterator[String]
478
479 init(root: RopeString) is old_style_init do
480 pos = root.length - 1
481 subs = new ReverseRopeSubstrings(root)
482 ns = subs.item
483 pns = ns.length - 1
484 end
485
486 init from(root: RopeString, pos: Int) do
487 self.pos = pos
488 subs = new ReverseRopeSubstrings.from(root, pos)
489 ns = subs.item
490 pns = pos - subs.index
491 end
492
493 redef fun index do return pos
494
495 redef fun is_ok do return pos >= 0
496
497 redef fun item do return ns[pns]
498
499 redef fun next do
500 pns -= 1
501 pos -= 1
502 if pns >= 0 then return
503 if not subs.is_ok then return
504 subs.next
505 if not subs.is_ok then return
506 ns = subs.item
507 pns = ns.length - 1
508 end
509 end
510
511 # Forward iterator on the chars of a `Rope`
512 private class RopeIter
513 super IndexedIterator[Char]
514
515 # Position in current `String`
516 var pns: Int
517 # Current `String` being iterated on
518 var str: String
519 # Substrings of the Rope
520 var subs: IndexedIterator[String]
521 # Maximum position to iterate on (e.g. Rope.length)
522 var max: Int
523 # Position (char) in the Rope (0-indexed)
524 var pos: Int
525
526 init(root: RopeString) is old_style_init do
527 subs = new RopeSubstrings(root)
528 pns = 0
529 str = subs.item
530 max = root.length - 1
531 pos = 0
532 end
533
534 init from(root: RopeString, pos: Int) do
535 subs = new RopeSubstrings.from(root, pos)
536 pns = pos - subs.index
537 self.pos = pos
538 str = subs.item
539 max = root.length - 1
540 end
541
542 redef fun item do return str[pns]
543
544 redef fun is_ok do return pos <= max
545
546 redef fun index do return pos
547
548 redef fun next do
549 pns += 1
550 pos += 1
551 if pns < subs.item.length then return
552 if not subs.is_ok then return
553 subs.next
554 if not subs.is_ok then return
555 str = subs.item
556 pns = 0
557 end
558 end
559
560 # Substrings of a Rope (i.e. Reverse postfix iterator on leaves)
561 private class ReverseRopeSubstrings
562 super IndexedIterator[String]
563
564 # Visit Stack
565 var iter: RopeIterPiece is noinit
566 # Position in `Rope`
567 var pos: Int is noinit
568
569 # Current leaf
570 var str: String is noinit
571
572 init(root: RopeString) is old_style_init do
573 var r = new RopeIterPiece(root, false, true, null)
574 pos = root.length - 1
575 var lnod: String = root
576 loop
577 if lnod isa Concat then
578 lnod = lnod.right
579 r = new RopeIterPiece(lnod, false, true, r)
580 else
581 str = lnod
582 iter = r
583 break
584 end
585 end
586 end
587
588 init from(root: RopeString, pos: Int) do
589 var r = new RopeIterPiece(root, false, true, null)
590 var rnod: String = root
591 var off = pos
592 loop
593 if rnod isa Concat then
594 if off >= rnod.left.length then
595 off -= rnod.left.length
596 rnod = rnod.right
597 r = new RopeIterPiece(rnod, false, true, r)
598 else
599 r.ldone = true
600 rnod = rnod.left
601 r = new RopeIterPiece(rnod, false, true, r)
602 end
603 else
604 str = rnod
605 r.ldone = true
606 iter = r
607 self.pos = pos - off
608 break
609 end
610 end
611 end
612
613 redef fun item do return str
614
615 redef fun index do return pos
616
617 redef fun is_ok do return pos >= 0
618
619 redef fun next do
620 if pos < 0 then return
621 var curr = iter.prev
622 var currit = curr.node
623 while curr != null do
624 currit = curr.node
625 if not currit isa Concat then
626 str = currit
627 pos -= str.length
628 iter = curr
629 return
630 end
631 if not curr.rdone then
632 curr.rdone = true
633 curr = new RopeIterPiece(currit.right, false, false, curr)
634 continue
635 end
636 if not curr.ldone then
637 curr.ldone = true
638 curr = new RopeIterPiece(currit.left, false, false, curr)
639 continue
640 end
641 curr = curr.prev
642 end
643 pos = -1
644 end
645 end
646
647 private class RopeBufSubstringIterator
648 super Iterator[String]
649
650 # Iterator on the substrings of the building string
651 var iter: Iterator[String]
652 # Makes a String out of the buffered part of the Ropebuffer
653 var nsstr: String
654 # Did we attain the buffered part ?
655 var nsstr_done = false
656
657 init(str: RopeBuffer) is old_style_init do
658 iter = str.str.substrings
659 nsstr = new FlatString.with_infos(str.ns, str.rpos - str.dumped, str.dumped, str.rpos - 1)
660 if str.length == 0 then nsstr_done = true
661 end
662
663 redef fun is_ok do return iter.is_ok or not nsstr_done
664
665 redef fun item do
666 assert is_ok
667 if iter.is_ok then return iter.item
668 return nsstr
669 end
670
671 redef fun next do
672 if iter.is_ok then
673 iter.next
674 return
675 end
676 nsstr_done = true
677 end
678 end
679
680 # Substrings of a Rope (i.e. Postfix iterator on leaves)
681 private class RopeSubstrings
682 super IndexedIterator[String]
683
684 # Visit Stack
685 var iter: RopeIterPiece is noinit
686 # Position in `Rope`
687 var pos: Int is noinit
688 # Maximum position in `Rope` (i.e. length - 1)
689 var max: Int is noinit
690
691 # Current leaf
692 var str: String is noinit
693
694 init(root: RopeString) is old_style_init do
695 var r = new RopeIterPiece(root, true, false, null)
696 pos = 0
697 max = root.length - 1
698 var rnod: String = root
699 loop
700 if rnod isa Concat then
701 rnod = rnod.left
702 r = new RopeIterPiece(rnod, true, false, r)
703 else
704 str = rnod
705 r.rdone = true
706 iter = r
707 break
708 end
709 end
710 end
711
712 init from(root: RopeString, pos: Int) do
713 var r = new RopeIterPiece(root, true, false, null)
714 max = root.length - 1
715 var rnod: String = root
716 var off = pos
717 loop
718 if rnod isa Concat then
719 if off >= rnod.left.length then
720 r.rdone = true
721 off -= rnod.left.length
722 rnod = rnod.right
723 r = new RopeIterPiece(rnod, true, false, r)
724 else
725 rnod = rnod.left
726 r = new RopeIterPiece(rnod, true, false, r)
727 end
728 else
729 str = rnod
730 r.rdone = true
731 iter = r
732 self.pos = pos - off
733 break
734 end
735 end
736 end
737
738 redef fun item do return str
739
740 redef fun is_ok do return pos <= max
741
742 redef fun index do return pos
743
744 redef fun next do
745 pos += str.length
746 if pos > max then return
747 var it = iter.prev
748 var rnod = it.node
749 loop
750 if not rnod isa Concat then
751 it.ldone = true
752 it.rdone = true
753 str = rnod
754 iter = it.as(not null)
755 break
756 end
757 if not it.ldone then
758 rnod = rnod.left
759 it.ldone = true
760 it = new RopeIterPiece(rnod, false, false, it)
761 else if not it.rdone then
762 it.rdone = true
763 rnod = rnod.right
764 it = new RopeIterPiece(rnod, false, false, it)
765 else
766 it = it.prev
767 rnod = it.node
768 continue
769 end
770 end
771 end
772 end
773
774 # Implementation of a `StringCharView` for `RopeString` objects
775 private class RopeChars
776 super StringCharView
777
778 var tgt: RopeString
779
780 init(s: RopeString) is old_style_init do tgt = s
781
782 redef fun [](i) do
783 return tgt[i]
784 end
785
786 redef fun iterator_from(i) do return new RopeIter.from(tgt, i)
787
788 redef fun reverse_iterator_from(i) do return new RopeReviter.from(tgt, i)
789
790 end
791
792 # An Iterator over a RopeBuffer.
793 class RopeBufferIter
794 super IndexedIterator[Char]
795
796 # Subiterator.
797 var sit: IndexedIterator[Char]
798
799 # Native string iterated over.
800 var ns: NativeString
801
802 # Current position in `ns`.
803 var pns: Int
804
805 # Maximum position iterable.
806 var maxpos: Int
807
808 redef var index: Int
809
810 # Init the iterator from a RopeBuffer.
811 init(t: RopeBuffer) is old_style_init do
812 ns = t.ns
813 maxpos = t.rpos
814 sit = t.str.chars.iterator
815 pns = t.dumped
816 index = 0
817 end
818
819 # Init the iterator from a RopeBuffer starting from `pos`.
820 init from(t: RopeBuffer, pos: Int) do
821 ns = t.ns
822 maxpos = t.length
823 sit = t.str.chars.iterator_from(pos)
824 pns = pos - t.str.length
825 index = pos
826 end
827
828 redef fun is_ok do return index < maxpos
829
830 redef fun item do
831 if sit.is_ok then return sit.item
832 return ns[pns]
833 end
834
835 redef fun next do
836 index += 1
837 if sit.is_ok then
838 sit.next
839 else
840 pns += 1
841 end
842 end
843 end
844
845 # Reverse iterator over a RopeBuffer.
846 class RopeBufferReviter
847 super IndexedIterator[Char]
848
849 # Subiterator.
850 var sit: IndexedIterator[Char]
851
852 # Native string iterated over.
853 var ns: NativeString
854
855 # Current position in `ns`.
856 var pns: Int
857
858 redef var index: Int
859
860 # Init the iterator from a RopeBuffer.
861 init(tgt: RopeBuffer) is old_style_init do
862 sit = tgt.str.chars.reverse_iterator
863 pns = tgt.rpos - 1
864 index = tgt.length - 1
865 ns = tgt.ns
866 end
867
868 # Init the iterator from a RopeBuffer starting from `pos`.
869 init from(tgt: RopeBuffer, pos: Int) do
870 sit = tgt.str.chars.reverse_iterator_from(pos - tgt.rpos - tgt.dumped)
871 pns = pos - tgt.str.length
872 index = pos
873 ns = tgt.ns
874 end
875
876 redef fun is_ok do return index > 0
877
878 redef fun item do
879 if pns >= 0 then return ns[pns]
880 return sit.item
881 end
882
883 redef fun next do
884 index -= 1
885 if pns >= 0 then
886 pns -= 1
887 else
888 sit.next
889 end
890 end
891 end
892
893 # View on the chars of a `RopeBuffer`
894 class RopeBufferChars
895 super BufferCharView
896
897 redef type SELFTYPE: RopeBuffer
898
899 redef fun [](i) do
900 if i < target.str.length then
901 return target.str[i]
902 else
903 return target.ns[i - target.str.length]
904 end
905 end
906
907 redef fun []=(i,c) do
908 if i == target.length then target.add c
909 if i < target.str.length then
910 var s = target.str
911 var l = s.substring(0, i)
912 var r = s.substring_from(i + 1)
913 target.str = l + c.to_s + r
914 else
915 target.ns[i - target.str.length] = c
916 end
917 end
918
919 redef fun add(c) do target.add c
920
921 redef fun push(c) do target.add c
922
923 redef fun iterator_from(i) do return new RopeBufferIter.from(target, i)
924
925 redef fun reverse_iterator_from(i) do return new RopeBufferReviter.from(target, i)
926 end