text/flat: FlatText::char_to_byte_index shortcut length_of_char_at if possible
[nit.git] / lib / core / text / flat.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # All the array-based text representations
12 module flat
13
14 intrude import abstract_text
15 intrude import native
16
17 `{
18 #include <stdio.h>
19 #include <string.h>
20 `}
21
# Iterator over the substrings of a flat text
#
# It yields `tgt` exactly once: `next` drops the reference, which ends the iteration.
private class FlatSubstringsIter
	super Iterator[FlatText]

	# Text still to be yielded, `null` once it has been consumed
	var tgt: nullable FlatText

	redef fun is_ok
	do
		return tgt != null
	end

	redef fun item
	do
		assert is_ok
		return tgt.as(not null)
	end

	redef fun next
	do
		tgt = null
	end
end
36
redef class FlatText

	# Offset in `_items` of the first byte of the text
	protected fun first_byte: Int
	do
		return 0
	end

	# Offset in `_items` of the last byte of the text
	protected fun last_byte: Int
	do
		return first_byte + _bytelen - 1
	end

	# Char index of the latest character located by `char_to_byte_index`
	var position: Int = 0

	# Byte offset in `_items` of the character cached in `position`
	var bytepos: Int = 0

	# Index of the character `index` in `_items`
	#
	# The answer is stored in `position` and `bytepos`, so that a later
	# lookup of the same or of a neighbouring character is cheap.
	fun char_to_byte_index(index: Int): Int
	do
		var cached_char = _position
		var cached_byte = _bytepos
		var step = index - cached_char

		# Same character as the previous lookup
		if step == 0 then return cached_byte

		var native = _items
		var found: Int
		if step == 1 then
			# Next character: an ASCII byte is always one byte long, so
			# `length_of_char_at` is only needed for multi-byte sequences
			if native[cached_byte] & 0x80u8 == 0x00u8 then
				found = cached_byte + 1
			else
				found = cached_byte + native.length_of_char_at(cached_byte)
			end
		else if step == -1 then
			# Previous character
			found = native.find_beginning_of_char_at(cached_byte - 1)
		else
			# Longer jump: walk from the closest known point, which is
			# either the cache, the beginning or the end of the text
			var from_start = index
			var from_end = (_length - 1) - index
			var from_cache = step.abs

			var start_char: Int
			var start_byte: Int
			if from_cache <= from_start and from_cache <= from_end then
				start_char = cached_char
				start_byte = cached_byte
			else if from_start <= from_end then
				start_char = 0
				start_byte = first_byte
			else
				start_char = _length - 1
				start_byte = native.find_beginning_of_char_at(last_byte)
			end
			found = native.char_to_byte_index_cached(index, start_char, start_byte)
		end

		_position = index
		_bytepos = found
		return found
	end

	# By escaping `self` to HTML, how many more bytes will be needed ?
	fun chars_to_html_escape: Int
	do
		var native = _items
		var stop = last_byte
		var i = first_byte
		var extra = 0
		while i <= stop do
			var byte = native[i]
			if byte == 0x3Cu8 or byte == 0x3Eu8 then
				# `<` and `>` become 4-byte entities
				extra += 3
			else if byte == 0x26u8 or byte == 0x22u8 or byte == 0x27u8 or byte == 0x2Fu8 then
				# `&`, `"`, `'` and `/` become 5-byte entities
				extra += 4
			end
			i += 1
		end
		return extra
	end

	redef fun html_escape
	do
		var extra = chars_to_html_escape
		# Nothing to escape: no copy is needed
		if extra == 0 then return to_s

		var src = _items
		var stop = last_byte
		var nlen = extra + _bytelen
		var dst = new NativeString(nlen)
		var out = 0
		var i = first_byte
		while i <= stop do
			var byte = src[i]
			# Some HTML characters are used as meta-data, they are
			# replaced by an HTML-escaped equivalent:
			#
			# * 0x3C (<) => &lt;
			# * 0x3E (>) => &gt;
			# * 0x26 (&) => &amp;
			# * 0x22 (") => &#34;
			# * 0x27 (') => &#39;
			# * 0x2F (/) => &#47;
			if byte == 0x3Cu8 or byte == 0x3Eu8 then
				# `&lt;` and `&gt;` only differ by their second byte
				dst[out] = 0x26u8
				dst[out + 1] = if byte == 0x3Cu8 then 0x6Cu8 else 0x67u8
				dst[out + 2] = 0x74u8
				dst[out + 3] = 0x3Bu8
				out += 4
			else if byte == 0x26u8 then
				dst[out] = 0x26u8
				dst[out + 1] = 0x61u8
				dst[out + 2] = 0x6Du8
				dst[out + 3] = 0x70u8
				dst[out + 4] = 0x3Bu8
				out += 5
			else if byte == 0x22u8 or byte == 0x27u8 or byte == 0x2Fu8 then
				# Numeric entity `&#NN;`, the two digits default to "34" (`"`)
				var tens = 0x33u8
				var units = 0x34u8
				if byte == 0x27u8 then
					units = 0x39u8
				else if byte == 0x2Fu8 then
					tens = 0x34u8
					units = 0x37u8
				end
				dst[out] = 0x26u8
				dst[out + 1] = 0x23u8
				dst[out + 2] = tens
				dst[out + 3] = units
				dst[out + 4] = 0x3Bu8
				out += 5
			else
				dst[out] = byte
				out += 1
			end
			i += 1
		end
		return new FlatString.with_infos(dst, nlen, 0)
	end

	# By escaping `self` to C, how many more bytes will be needed ?
	#
	# This enables a double-optimization in `escape_to_c` since if this
	# method returns 0, then `self` does not need escaping and can be
	# returned as-is
	fun chars_to_escape_to_c: Int
	do
		var native = _items
		var stop = last_byte
		var i = first_byte
		var extra = 0
		while i <= stop do
			var byte = native[i]
			if byte == 0x0Au8 or byte == 0x09u8 or byte == 0x22u8 or byte == 0x27u8 or byte == 0x5Cu8 then
				# Two-byte escape sequence
				extra += 1
			else if byte < 32u8 then
				# Four-byte octal escape sequence
				extra += 3
			end
			i += 1
		end
		return extra
	end

	redef fun escape_to_c
	do
		var extra = chars_to_escape_to_c
		# Nothing to escape: no copy is needed
		if extra == 0 then return self.to_s

		var src = _items
		var stop = last_byte
		var nlen = _bytelen + extra
		var dst = new NativeString(nlen)
		var out = 0
		var i = first_byte
		while i <= stop do
			var byte = src[i]
			# Any byte with a value < 32 is a control character and
			# is replaced by its octal value in C, except:
			#
			# * 0x09 => \t
			# * 0x0A => \n
			#
			# The following are escaped with a backslash:
			#
			# * 0x22 => \"
			# * 0x27 => \'
			# * 0x5C => \\
			if byte == 0x09u8 then
				dst[out] = 0x5Cu8
				dst[out + 1] = 0x74u8
				out += 2
			else if byte == 0x0Au8 then
				dst[out] = 0x5Cu8
				dst[out + 1] = 0x6Eu8
				out += 2
			else if byte == 0x22u8 or byte == 0x27u8 or byte == 0x5Cu8 then
				# The byte itself follows the backslash
				dst[out] = 0x5Cu8
				dst[out + 1] = byte
				out += 2
			else if byte < 32u8 then
				# `\0` followed by the two octal digits of the byte
				dst[out] = 0x5Cu8
				dst[out + 1] = 0x30u8
				dst[out + 2] = ((byte & 0x38u8) >> 3) + 0x30u8
				dst[out + 3] = (byte & 0x07u8) + 0x30u8
				out += 4
			else
				dst[out] = byte
				out += 1
			end
			i += 1
		end
		return dst.to_s_unsafe(nlen)
	end

	redef fun [](index)
	do
		assert index >= 0 and index < _length
		return fetch_char_at(index)
	end

	# Gets a `Char` at `index` in `self`
	#
	# WARNING: Use at your own risks as no bound-checking is done
	fun fetch_char_at(index: Int): Char
	do
		var at = char_to_byte_index(index)
		var native = _items
		var lead = native[at]
		if lead & 0x80u8 == 0x00u8 then
			# ASCII: the byte is the character
			return lead.ascii
		else
			return native.char_at(at)
		end
	end

	# If `self` contains only digits and alpha <= 'f', return the corresponding integer.
	#
	#     assert "ff".to_hex == 255
	redef fun to_hex(pos, ln)
	do
		if pos == null then pos = 0
		if ln == null then ln = length - pos
		var native = _items
		var i = char_to_byte_index(pos)
		var stop = i + ln
		var value = 0
		# One byte is read per hexadecimal digit
		while i < stop do
			value = (value << 4) + native[i].ascii.from_hex
			i += 1
		end
		return value
	end
end
331
# Immutable strings of characters.
class FlatString
	super FlatText
	super String

	# Index at which `self` begins in `_items`, inclusively
	redef var first_byte is noinit

	redef var chars = new FlatStringCharView(self) is lazy

	redef var bytes = new FlatStringByteView(self) is lazy

	# NUL-terminated copy of the bytes of `self`, built on first use
	redef var to_cstring is lazy do
		var blen = _bytelen
		var new_items = new NativeString(blen + 1)
		_items.copy_to(new_items, blen, _first_byte, 0)
		new_items[blen] = 0u8
		return new_items
	end

	redef fun reversed do
		var b = new FlatBuffer.with_capacity(_bytelen + 1)
		var i = _length - 1
		while i >= 0 do
			b.add self.fetch_char_at(i)
			i -= 1
		end
		var s = b.to_s.as(FlatString)
		s._length = self._length
		return s
	end

	redef fun fast_cstring do return _items.fast_cstring(_first_byte)

	redef fun substring_from(from) do
		if from >= self._length then return empty
		if from <= 0 then return self
		var c = char_to_byte_index(from)
		var st = c - _first_byte
		var fln = bytelen - st
		# The bytes are shared with `self`, only the window changes
		return new FlatString.full(items, fln, c, _length - from)
	end

	redef fun substring(from, count)
	do
		if count <= 0 then return ""

		# A negative `from` shortens the request by as many characters
		if from < 0 then
			count += from
			if count < 0 then return ""
			from = 0
		end

		# Clip the request to the end of the string
		var ln = _length
		if (count + from) > ln then count = ln - from
		if count <= 0 then return ""
		var end_index = from + count - 1
		return substring_impl(from, count, end_index)
	end

	# Builds the substring of `count` characters going from char `from` to char `end_index`
	#
	# Both bounds are expected to be valid char indexes of `self`,
	# `substring` takes care of the clipping.
	# The result shares `_items` with `self`.
	private fun substring_impl(from, count, end_index: Int): String do
		# Locate first the bound that is closest to the cached position:
		# the second lookup then starts from a cache set inside the substring
		var cache = _position
		var dfrom = (cache - from).abs
		var dend = (cache - end_index).abs

		var bytefrom: Int
		var byteto: Int
		if dfrom < dend then
			bytefrom = char_to_byte_index(from)
			byteto = char_to_byte_index(end_index)
		else
			byteto = char_to_byte_index(end_index)
			bytefrom = char_to_byte_index(from)
		end

		# `byteto` is on the first byte of the last char, move it to its last byte
		var its = _items
		byteto += its.length_of_char_at(byteto) - 1

		var s = new FlatString.full(its, byteto - bytefrom + 1, bytefrom, count)
		return s
	end

	redef fun empty do return "".as(FlatString)

	redef fun to_upper
	do
		var outstr = new FlatBuffer.with_capacity(self._bytelen + 1)

		var mylen = _length
		var pos = 0

		while pos < mylen do
			outstr.add(chars[pos].to_upper)
			pos += 1
		end

		return outstr.to_s
	end

	redef fun to_lower
	do
		var outstr = new FlatBuffer.with_capacity(self._bytelen + 1)

		var mylen = _length
		var pos = 0

		while pos < mylen do
			outstr.add(chars[pos].to_lower)
			pos += 1
		end

		return outstr.to_s
	end

	redef fun output
	do
		for i in chars do i.output
	end

	##################################################
	#              String Specific Methods           #
	##################################################

	# Low-level creation of a new string with minimal data.
	#
	# `_items` will be used as is, without copy, to retrieve the characters of the string.
	# Aliasing issues is the responsibility of the caller.
	#
	# The length in characters is computed from the bytes.
	private init with_infos(items: NativeString, bytelen, from: Int)
	do
		self._items = items
		self._bytelen = bytelen
		_first_byte = from
		_bytepos = from
		_length = _items.utf8_length(_first_byte, bytelen)
	end

	# Low-level creation of a new string with all the data.
	#
	# `_items` will be used as is, without copy, to retrieve the characters of the string.
	# Aliasing issues is the responsibility of the caller.
	private init full(items: NativeString, bytelen, from, length: Int)
	do
		self._items = items
		self._length = length
		self._bytelen = bytelen
		_first_byte = from
		_bytepos = from
	end

	redef fun ==(other)
	do
		if not other isa FlatText then return super

		if self.object_id == other.object_id then return true

		var my_length = _bytelen

		# Texts of different byte lengths cannot be equal
		if other._bytelen != my_length then return false

		var my_index = _first_byte
		var its_index = other.first_byte

		var last_iteration = my_index + my_length

		var its_items = other._items
		var my_items = self._items

		while my_index < last_iteration do
			if my_items[my_index] != its_items[its_index] then return false
			my_index += 1
			its_index += 1
		end

		return true
	end

	redef fun <(other)
	do
		if not other isa FlatText then return super

		if self.object_id == other.object_id then return false

		var myits = _items
		var itsits = other._items

		var mbt = _bytelen
		var obt = other.bytelen

		var minln = if mbt < obt then mbt else obt
		var mst = _first_byte
		var ost = other.first_byte

		# Byte-wise comparison of the common prefix
		for i in [0 .. minln[ do
			var my_curr_char = myits[mst]
			var its_curr_char = itsits[ost]

			if my_curr_char > its_curr_char then return false
			if my_curr_char < its_curr_char then return true

			mst += 1
			ost += 1
		end

		# Same prefix: the shortest text comes first
		return mbt < obt
	end

	redef fun +(o) do
		var s = o.to_s
		var slen = s.bytelen
		var mlen = _bytelen
		var nlen = mlen + slen
		var mits = _items
		var mifrom = _first_byte
		if s isa FlatText then
			var sits = s._items
			var sifrom = s.first_byte
			var ns = new NativeString(nlen + 1)
			mits.copy_to(ns, mlen, mifrom, 0)
			sits.copy_to(ns, slen, sifrom, mlen)
			return new FlatString.full(ns, nlen, 0, _length + o.length)
		else
			abort
		end
	end

	redef fun *(i) do
		var mybtlen = _bytelen
		var new_bytelen = mybtlen * i
		var mylen = _length
		var newlen = mylen * i
		var its = _items
		var fb = _first_byte
		var ns = new NativeString(new_bytelen + 1)
		ns[new_bytelen] = 0u8
		var offset = 0
		while i > 0 do
			its.copy_to(ns, mybtlen, fb, offset)
			offset += mybtlen
			i -= 1
		end
		return new FlatString.full(ns, new_bytelen, 0, newlen)
	end


	redef fun hash
	do
		if hash_cache == null then
			# djb2 hash algorithm
			var h = 5381
			var i = _first_byte

			var my_items = _items
			var max = last_byte

			while i <= max do
				h = (h << 5) + h + my_items[i].to_i
				i += 1
			end

			hash_cache = h
		end

		return hash_cache.as(not null)
	end

	redef fun substrings do return new FlatSubstringsIter(self)
end
599
# Iterator going backward over the characters of a `FlatString`
private class FlatStringCharReverseIterator
	super IndexedIterator[Char]

	# String whose characters are visited
	var target: FlatString

	# Char index of the current item, the iteration stops below 0
	var curr_pos: Int

	redef fun index
	do
		return curr_pos
	end

	redef fun is_ok
	do
		return curr_pos >= 0
	end

	redef fun item
	do
		return target[curr_pos]
	end

	redef fun next
	do
		curr_pos -= 1
	end
end
616
# Iterator going forward over the characters of a `FlatString`
private class FlatStringCharIterator
	super IndexedIterator[Char]

	# String whose characters are visited
	var target: FlatString

	# Char index of the last item, computed once at creation
	var max: Int is noautoinit

	# Char index of the current item
	var curr_pos: Int

	init
	do
		max = target._length - 1
	end

	redef fun index
	do
		return curr_pos
	end

	redef fun is_ok
	do
		return curr_pos <= max
	end

	redef fun item
	do
		return target[curr_pos]
	end

	redef fun next
	do
		curr_pos += 1
	end
end
637
# View of a `FlatString` as a sequence of characters
private class FlatStringCharView
	super StringCharView

	redef type SELFTYPE: FlatString

	redef fun [](index)
	do
		return target[index]
	end

	redef fun iterator_from(start)
	do
		return new FlatStringCharIterator(target, start)
	end

	redef fun reverse_iterator_from(start)
	do
		return new FlatStringCharReverseIterator(target, start)
	end
end
650
# Iterator going backward over the bytes of a `FlatString`
private class FlatStringByteReverseIterator
	super IndexedIterator[Byte]

	# String whose bytes are visited
	var target: FlatString

	# Byte array of `target`, fetched once at creation
	var target_items: NativeString is noautoinit

	# Position of the current item
	#
	# It is given relative to the beginning of `target`, then `init`
	# turns it into an offset in `target_items`.
	var curr_pos: Int

	init
	do
		target_items = target._items
		curr_pos = curr_pos + target._first_byte
	end

	redef fun index
	do
		return curr_pos - target._first_byte
	end

	redef fun is_ok
	do
		return curr_pos >= target._first_byte
	end

	redef fun item
	do
		return target_items[curr_pos]
	end

	redef fun next
	do
		curr_pos -= 1
	end
end
676
# Iterator going forward over the bytes of a `FlatString`
private class FlatStringByteIterator
	super IndexedIterator[Byte]

	# String whose bytes are visited
	var target: FlatString

	# Byte array of `target`, fetched once at creation
	var target_items: NativeString is noautoinit

	# Position of the current item
	#
	# It is given relative to the beginning of `target`, then `init`
	# turns it into an offset in `target_items`.
	var curr_pos: Int

	init
	do
		target_items = target._items
		curr_pos = curr_pos + target._first_byte
	end

	redef fun index
	do
		return curr_pos - target._first_byte
	end

	redef fun is_ok
	do
		return curr_pos <= target.last_byte
	end

	redef fun item
	do
		return target_items[curr_pos]
	end

	redef fun next
	do
		curr_pos += 1
	end
end
702
# View of a `FlatString` as a sequence of bytes
private class FlatStringByteView
	super StringByteView

	redef type SELFTYPE: FlatString

	redef fun [](index)
	do
		var str = _target
		# `index` is relative to the string, it must fall inside its bytes
		assert index >= 0 and index < str._bytelen
		return str._items[str._first_byte + index]
	end

	redef fun iterator_from(start)
	do
		return new FlatStringByteIterator(target, start)
	end

	redef fun reverse_iterator_from(start)
	do
		return new FlatStringByteReverseIterator(target, start)
	end
end
723
redef class Buffer
	# A new `Buffer` is a `FlatBuffer`
	redef new
	do
		return new FlatBuffer
	end

	# A new `Buffer` with room for `i` bytes is a `FlatBuffer`
	redef new with_cap(i)
	do
		return new FlatBuffer.with_capacity(i)
	end
end
729
# Mutable strings of characters.
class FlatBuffer
	super FlatText
	super Buffer

	redef var chars: Sequence[Char] = new FlatBufferCharView(self) is lazy

	redef var bytes = new FlatBufferByteView(self) is lazy

	private var char_cache: Int = -1

	private var byte_cache: Int = -1

	# Number of bytes that `_items` can hold before being reallocated
	private var capacity = 0

	# Real items, used as cache for when to_cstring is called
	private var real_items: NativeString is noinit

	redef fun fast_cstring do return _items.fast_cstring(0)

	redef fun substrings do return new FlatSubstringsIter(self)

	# Re-copies the `NativeString` into a new one and sets it as the new `Buffer`
	#
	# This happens when an operation modifies the current `Buffer` and
	# the Copy-On-Write flag `written` is set at true.
	private fun reset do
		var nns = new NativeString(capacity)
		if _bytelen != 0 then _items.copy_to(nns, _bytelen, 0, 0)
		_items = nns
		written = false
	end

	# Shifts the content of the buffer by `len` bytes to the right, starting at byte `from`
	#
	# Internal only, does not modify _bytelen or length, this is the caller's responsability
	#
	# `_items` is replaced by a bigger array when the shifted content does not fit:
	# callers must read `_items` again after the call.
	private fun rshift_bytes(from: Int, len: Int) do
		var bt = _bytelen
		# `enlarge` copies the whole content and installs the new array when needed
		enlarge(bt + len)
		var it = _items
		it.copy_to(it, bt - from, from, from + len)
	end

	# Shifts the content of the buffer by `len` bytes to the left, starting at `from`
	#
	# Internal only, does not modify _bytelen or length, this is the caller's responsability
	private fun lshift_bytes(from: Int, len: Int) do
		var it = _items
		it.copy_to(it, _bytelen - from, from, from - len)
	end

	# Replaces the character at `index` by `item`
	#
	# `index == length` is accepted and appends `item`.
	redef fun []=(index, item)
	do
		assert index >= 0 and index <= _length
		if written then reset
		is_dirty = true
		if index == _length then
			add item
			return
		end
		var ip = _items.char_to_byte_index(index)
		var c = _items.char_at(ip)
		var clen = c.u8char_len
		var itemlen = item.u8char_len
		# The old and the new character may not use the same number of bytes:
		# make room for, or close the gap left by, the difference
		var size_diff = itemlen - clen
		if size_diff > 0 then
			rshift_bytes(ip + clen, size_diff)
		else if size_diff < 0 then
			lshift_bytes(ip + clen, -size_diff)
		end
		_bytelen += size_diff
		# The characters after `index` moved by `size_diff` bytes:
		# keep the lookup cache of `char_to_byte_index` on its character
		if _position > index then _bytepos += size_diff
		# `_items` is read again since `rshift_bytes` may have reallocated it
		_items.set_char_at(ip, item)
	end

	redef fun add(c)
	do
		if written then reset
		is_dirty = true
		var clen = c.u8char_len
		var bt = _bytelen
		enlarge(bt + clen)
		_items.set_char_at(bt, c)
		_bytelen += clen
		_length += 1
	end

	redef fun clear do
		is_dirty = true
		if written then reset
		_bytelen = 0
		_length = 0
		# The lookup cache of `char_to_byte_index` pointed into the old content
		_position = 0
		_bytepos = 0
	end

	redef fun empty do return new Buffer

	redef fun enlarge(cap)
	do
		var c = capacity
		if cap <= c then return
		while c <= cap do c = c * 2 + 2
		# The COW flag can be set at false here, since
		# it does a copy of the current `Buffer`
		written = false
		var bln = _bytelen
		var a = new NativeString(c+1)
		if bln > 0 then _items.copy_to(a, bln, 0, 0)
		_items = a
		capacity = c
	end

	redef fun to_s
	do
		# The returned string shares `_items`: the next write must copy them
		written = true
		var bln = _bytelen
		if bln == 0 then _items = new NativeString(1)
		return new FlatString.full(_items, bln, 0, _length)
	end

	redef fun to_cstring
	do
		# The NUL-terminated copy is only rebuilt after a modification
		if is_dirty then
			var bln = _bytelen
			var new_native = new NativeString(bln + 1)
			new_native[bln] = 0u8
			if _length > 0 then _items.copy_to(new_native, bln, 0, 0)
			real_items = new_native
			is_dirty = false
		end
		return real_items
	end

	# Create a new empty string.
	init do end

	# Low-level creation a new buffer with given data.
	#
	# `_items` will be used as is, without copy, to store the characters of the buffer.
	# Aliasing issues is the responsibility of the caller.
	#
	# If `_items` is shared, `written` should be set to true after the creation
	# so that a modification will do a copy-on-write.
	private init with_infos(items: NativeString, capacity, bytelen, length: Int)
	do
		self._items = items
		self.capacity = capacity
		self._bytelen = bytelen
		self._length = length
	end

	# Create a new string copied from `s`.
	#
	# The bytes of a `FlatString` that begins at the start of its storage are
	# shared until the first modification (copy-on-write).
	# In the other cases the bytes are copied right away: a buffer always
	# begins at byte 0 of `_items`, and the storage of another buffer may be
	# modified in place by its owner.
	init from(s: Text)
	do
		var bln = s.bytelen
		var shared: nullable NativeString = null
		if s isa FlatString then
			if s._first_byte == 0 then shared = s._items
		end
		if shared != null then
			_items = shared
			written = true
		else
			var nits = new NativeString(bln + 1)
			var off = 0
			# A flat text is its own single substring
			for sub in s.substrings do
				var sln = sub._bytelen
				sub._items.copy_to(nits, sln, sub.first_byte, off)
				off += sln
			end
			_items = nits
			written = false
		end
		_bytelen = bln
		_length = s.length
		_capacity = bln
	end

	# Create a new empty string with a given capacity.
	init with_capacity(cap: Int)
	do
		assert cap >= 0
		_items = new NativeString(cap + 1)
		capacity = cap
		_bytelen = 0
	end

	redef fun append(s)
	do
		if s.is_empty then return
		is_dirty = true
		var sl = s.bytelen
		var nln = _bytelen + sl
		enlarge(nln)
		if s isa FlatText then
			s._items.copy_to(_items, sl, s.first_byte, _bytelen)
		else
			# Each substring is flat and updates the lengths by itself
			for i in s.substrings do append i
			return
		end
		_bytelen = nln
		_length += s.length
	end

	# Copies the content of self in `dest`
	#
	# The `len` characters starting at `start` are written in `dest`
	# from the position `new_start`.
	fun copy(start: Int, len: Int, dest: Buffer, new_start: Int)
	do
		var self_chars = self.chars
		var dest_chars = dest.chars
		for i in [0..len-1] do
			dest_chars[new_start+i] = self_chars[start+i]
		end
	end

	redef fun substring(from, count)
	do
		assert count >= 0
		if from < 0 then from = 0
		if (from + count) > _length then count = _length - from
		if count <= 0 then return new Buffer
		var its = _items
		var bytefrom = its.char_to_byte_index(from)
		var byteto = its.char_to_byte_index(count + from - 1)
		# `byteto` is on the first byte of the last char, move it to its last byte
		byteto += its.char_at(byteto).u8char_len - 1
		var byte_length = byteto - bytefrom + 1
		var r_items = new NativeString(byte_length)
		its.copy_to(r_items, byte_length, bytefrom, 0)
		return new FlatBuffer.with_infos(r_items, byte_length, byte_length, count)
	end

	redef fun reverse
	do
		written = false
		var ns = new FlatBuffer.with_capacity(capacity)
		for i in chars.reverse_iterator do ns.add i
		_items = ns._items
		# The content changed: `to_cstring` must rebuild its copy
		is_dirty = true
	end

	redef fun times(repeats)
	do
		var bln = _bytelen
		# Snapshot of the current content, appended `repeats - 1` times
		var x = new FlatString.full(_items, bln, 0, _length)
		for i in [1 .. repeats[ do
			append(x)
		end
	end

	redef fun upper
	do
		if written then reset
		for i in [0 .. _length[ do self[i] = self[i].to_upper
	end

	redef fun lower
	do
		if written then reset
		for i in [0 .. _length[ do self[i] = self[i].to_lower
	end
end
984
# Iterator going backward over the bytes of a `FlatBuffer`
private class FlatBufferByteReverseIterator
	super IndexedIterator[Byte]

	# Buffer whose bytes are visited
	var target: FlatBuffer

	# Byte array of `target`, fetched once at creation
	var target_items: NativeString is noautoinit

	# Offset in `target_items` of the current item, the iteration stops below 0
	var curr_pos: Int

	init
	do
		target_items = target._items
	end

	redef fun is_ok
	do
		return curr_pos >= 0
	end

	redef fun index
	do
		return curr_pos
	end

	redef fun item
	do
		return target_items[curr_pos]
	end

	redef fun next
	do
		curr_pos -= 1
	end
end
1005
# View of a `FlatBuffer` as a sequence of bytes
private class FlatBufferByteView
	super BufferByteView

	redef type SELFTYPE: FlatBuffer

	# Byte at `index` in the byte array of the buffer, no bound-checking is done here
	redef fun [](index)
	do
		return target._items[index]
	end

	redef fun iterator_from(pos)
	do
		return new FlatBufferByteIterator(target, pos)
	end

	redef fun reverse_iterator_from(pos)
	do
		return new FlatBufferByteReverseIterator(target, pos)
	end
end
1018
# Iterator going forward over the bytes of a `FlatBuffer`
private class FlatBufferByteIterator
	super IndexedIterator[Byte]

	# Buffer whose bytes are visited
	var target: FlatBuffer

	# Byte array of `target`, fetched once at creation
	var target_items: NativeString is noautoinit

	# Offset in `target_items` of the current item
	var curr_pos: Int

	init
	do
		target_items = target._items
	end

	redef fun is_ok
	do
		return curr_pos < target._bytelen
	end

	redef fun index
	do
		return curr_pos
	end

	redef fun item
	do
		return target_items[curr_pos]
	end

	redef fun next
	do
		curr_pos += 1
	end
end
1039
# Iterator going backward over the characters of a `FlatBuffer`
private class FlatBufferCharReverseIterator
	super IndexedIterator[Char]

	# Buffer whose characters are visited
	var target: FlatBuffer

	# Char index of the current item, the iteration stops below 0
	var curr_pos: Int

	redef fun is_ok
	do
		return curr_pos >= 0
	end

	redef fun index
	do
		return curr_pos
	end

	redef fun item
	do
		return target[curr_pos]
	end

	redef fun next
	do
		curr_pos -= 1
	end
end
1056
# View of a `FlatBuffer` as a sequence of characters
#
# The modifications are forwarded to the buffer.
private class FlatBufferCharView
	super BufferCharView

	redef type SELFTYPE: FlatBuffer

	redef fun [](index)
	do
		return target[index]
	end

	# Replaces the character at `index`, or appends `item` when `index == length`
	redef fun []=(index, item)
	do
		assert index >= 0 and index <= length
		if index != length then
			target[index] = item
		else
			add(item)
		end
	end

	redef fun push(c)
	do
		target.add(c)
	end

	redef fun add(c)
	do
		target.add(c)
	end

	# Asks the buffer for a capacity of at least `cap`
	fun enlarge(cap: Int)
	do
		target.enlarge(cap)
	end

	redef fun append(s)
	do
		var incoming = s.length
		# Grow once before adding the characters one by one
		if target.capacity < s.length then
			enlarge(incoming + target._length)
		end
		for c in s do
			target.add c
		end
	end

	redef fun iterator_from(pos)
	do
		return new FlatBufferCharIterator(target, pos)
	end

	redef fun reverse_iterator_from(pos)
	do
		return new FlatBufferCharReverseIterator(target, pos)
	end
end
1101
# Iterator going forward over the characters of a `FlatBuffer`
private class FlatBufferCharIterator
	super IndexedIterator[Char]

	# Buffer whose characters are visited
	var target: FlatBuffer

	# Char index of the last item, computed once at creation
	var max: Int is noautoinit

	# Char index of the current item
	var curr_pos: Int

	init
	do
		max = target._length - 1
	end

	redef fun is_ok
	do
		return curr_pos <= max
	end

	redef fun index
	do
		return curr_pos
	end

	redef fun item
	do
		return target[curr_pos]
	end

	redef fun next
	do
		curr_pos += 1
	end
end
1122
redef class NativeString
	redef fun to_s
	do
		return to_s_with_length(cstring_length)
	end

	# Returns `self` as a String of `length`.
	redef fun to_s_with_length(length): FlatString
	do
		assert length >= 0
		return clean_utf8(length)
	end

	redef fun to_s_full(bytelen, unilen) do
		return new FlatString.full(self, bytelen, 0, unilen)
	end

	redef fun to_s_unsafe(len) do
		if len == null then len = cstring_length
		return new FlatString.with_infos(self, len, 0)
	end

	# Returns `self` as a new String.
	redef fun to_s_with_copy: FlatString
	do
		var length = cstring_length
		var r = clean_utf8(length)
		# The cleaning already produced a fresh array
		if r.items != self then return r
		var new_self = new NativeString(length + 1)
		copy_to(new_self, length, 0, 0)
		var str = new FlatString.with_infos(new_self, length, 0)
		new_self[length] = 0u8
		str.to_cstring = new_self
		return str
	end

	# Cleans a NativeString if necessary
	#
	# The `len` first bytes of `self` are checked.
	# Each byte that does not begin an acceptable UTF-8 sequence is replaced by
	# the 3 bytes of U+FFFD (0xEF 0xBF 0xBD) in a new array.
	# A multi-byte sequence cut by `len` is handled the same way.
	# Without any replacement, the returned string uses `self` as is.
	fun clean_utf8(len: Int): FlatString do
		# Offsets of the bytes to replace, allocated on the first replacement
		var replacements: nullable Array[Int] = null
		var end_length = len
		var pos = 0
		var chr_ln = 0
		var rem = len
		while rem > 0 do
			# Fast path: skip the ASCII bytes 4 at a time
			while rem >= 4 do
				var i = fetch_4_chars(pos)
				if i & 0x80808080 != 0 then break
				pos += 4
				chr_ln += 4
				rem -= 4
			end
			if rem == 0 then break
			var b = self[pos]
			if b & 0x80u8 == 0x00u8 then
				pos += 1
				chr_ln += 1
				rem -= 1
				continue
			end
			# Check the leading byte against the length it announces
			var nxst = length_of_char_at(pos)
			var ok_st: Bool
			if nxst == 1 then
				ok_st = b & 0x80u8 == 0u8
			else if nxst == 2 then
				ok_st = b & 0xE0u8 == 0xC0u8
			else if nxst == 3 then
				ok_st = b & 0xF0u8 == 0xE0u8
			else
				ok_st = b & 0xF8u8 == 0xF0u8
			end
			# A sequence going past `len` is truncated: decoding it would
			# read bytes that are not part of the text
			if nxst > rem then ok_st = false
			if not ok_st then
				if replacements == null then replacements = new Array[Int]
				replacements.add pos
				# 1 byte becomes 3 bytes
				end_length += 2
				pos += 1
				rem -= 1
				chr_ln += 1
				continue
			end
			# Check the code point against the range allowed for its length
			var ok_c: Bool
			var c = char_at(pos)
			var cp = c.code_point
			if nxst == 1 then
				ok_c = cp >= 0 and cp <= 0x7F
			else if nxst == 2 then
				ok_c = cp >= 0x80 and cp <= 0x7FF
			else if nxst == 3 then
				ok_c = cp >= 0x800 and cp <= 0xFFFF
				# Surrogates, 0xFFFE and 0xFFFF are rejected
				ok_c = ok_c and not (cp >= 0xD800 and cp <= 0xDFFF) and cp != 0xFFFE and cp != 0xFFFF
			else
				ok_c = cp >= 0x10000 and cp <= 0x10FFFF
			end
			if not ok_c then
				if replacements == null then replacements = new Array[Int]
				replacements.add pos
				end_length += 2
				pos += 1
				chr_ln += 1
				rem -= 1
				continue
			end
			var clen = c.u8char_len
			pos += clen
			rem -= clen
			chr_ln += 1
		end
		var ret = self
		if end_length != len then
			# Rebuild the bytes: chunks of valid bytes separated by U+FFFD
			ret = new NativeString(end_length)
			var old_repl = 0
			var off = 0
			var repls = replacements.as(not null)
			var r = repls.items.as(not null)
			var imax = repls.length
			for i in [0 .. imax[ do
				var repl_pos = r[i]
				var chkln = repl_pos - old_repl
				copy_to(ret, chkln, old_repl, off)
				off += chkln
				ret[off] = 0xEFu8
				ret[off + 1] = 0xBFu8
				ret[off + 2] = 0xBDu8
				old_repl = repl_pos + 1
				off += 3
			end
			copy_to(ret, len - old_repl, old_repl, off)
		end
		return new FlatString.full(ret, end_length, 0, chr_ln)
	end

	# Sets the next bytes at position `pos` to the value of `c`, encoded in UTF-8
	#
	# Very unsafe, make sure to have room for this char prior to calling this function.
	private fun set_char_at(pos: Int, c: Char) do
		# ASCII characters are written directly
		if c.code_point < 128 then
			self[pos] = c.code_point.to_b
			return
		end
		var ln = c.u8char_len
		native_set_char(pos, c, ln)
	end

	# Writes at `pos` the `ln` bytes of the UTF-8 encoding of `c`
	#
	# Nothing is written if `ln` is not between 1 and 4.
	private fun native_set_char(pos: Int, c: Char, ln: Int) `{
		char* dst = self + pos;
		switch(ln){
			case 1:
				dst[0] = c;
				break;
			case 2:
				dst[0] = 0xC0 | ((c & 0x7C0) >> 6);
				dst[1] = 0x80 | (c & 0x3F);
				break;
			case 3:
				dst[0] = 0xE0 | ((c & 0xF000) >> 12);
				dst[1] = 0x80 | ((c & 0xFC0) >> 6);
				dst[2] = 0x80 | (c & 0x3F);
				break;
			case 4:
				dst[0] = 0xF0 | ((c & 0x1C0000) >> 18);
				dst[1] = 0x80 | ((c & 0x3F000) >> 12);
				dst[2] = 0x80 | ((c & 0xFC0) >> 6);
				dst[3] = 0x80 | (c & 0x3F);
				break;
		}
	`}
end
1289
redef class Int
	redef fun to_base(base, signed)
	do
		# A buffer of spaces of the right size, filled in place with the digits
		var width = digit_count(base)
		var buf = new FlatBuffer.from(" " * width)
		fill_buffer(buf, base, signed)
		return buf.to_s
	end

	# return displayable int in base 10 and signed
	#
	#     assert 1.to_s            == "1"
	#     assert (-123).to_s       == "-123"
	redef fun to_s
	do
		# Fast case for common numbers
		if self == 0 then
			return "0"
		else if self == 1 then
			return "1"
		end

		var len = int_to_s_len
		var native = new NativeString(len + 1)
		native[len] = 0u8
		native_int_to_s(native, len + 1)
		# Only ASCII: one char per byte
		return new FlatString.full(native, len, 0, len)
	end
end
1315
redef class Array[E]

	# Fast implementation
	#
	# The string of each non-null element is copied into a single byte array.
	redef fun plain_to_s
	do
		var count = _length
		if count == 0 then return ""
		var elements = _items.as(not null)
		if count == 1 then
			var only = elements[0]
			if only == null then return ""
			return only.to_s
		end

		# First pass: get the strings and the total number of bytes
		var strings = new NativeArray[String](count)
		var kept = 0
		var total = 0
		for i in [0 .. count[ do
			var element = elements[i]
			if element == null then continue
			var str = element.to_s
			total += str.bytelen
			strings[kept] = str
			kept += 1
		end

		# Second pass: copy the bytes of each string one after the other
		var native = new NativeString(total + 1)
		native[total] = 0u8
		var offset = 0
		for i in [0 .. kept[ do
			var str = strings[i]
			if str isa FlatString then
				var flat_len = str._bytelen
				str._items.copy_to(native, flat_len, str._first_byte, offset)
				offset += flat_len
			else
				for piece in str.substrings do
					var flat = piece.as(FlatString)
					var piece_len = flat._bytelen
					flat._items.copy_to(native, piece_len, flat._first_byte, offset)
					offset += piece_len
				end
			end
		end
		return new FlatString.with_infos(native, total, 0)
	end
end
1365
redef class NativeArray[E]
	# Concatenation of the strings of `self` into a single byte array
	redef fun native_to_s
	do
		assert self isa NativeArray[String]
		var strings = self
		var count = length

		# First pass: total number of bytes
		var total = 0
		for i in [0 .. count[ do
			total += strings[i].bytelen
		end

		# Second pass: copy the bytes of each string one after the other
		var native = new NativeString(total + 1)
		native[total] = 0u8
		var offset = 0
		for i in [0 .. count[ do
			var str = strings[i]
			if str isa FlatString then
				var flat_len = str._bytelen
				str._items.copy_to(native, flat_len, str._first_byte, offset)
				offset += flat_len
			else
				for piece in str.substrings do
					var flat = piece.as(FlatString)
					var piece_len = flat._bytelen
					flat._items.copy_to(native, piece_len, flat._first_byte, offset)
					offset += piece_len
				end
			end
		end
		return new FlatString.with_infos(native, total, 0)
	end
end
1402
redef class Map[K,V]
	# Concatenation of the couples of `self`
	#
	# A key and its value are separated by `couple_sep`, the couples by `sep`.
	# A `null` key or value is written `<null>`.
	redef fun join(sep, couple_sep)
	do
		if is_empty then return ""

		var res = new Buffer
		var first = true
		for k, e in self do
			# The separator goes between the couples only
			if first then
				first = false
			else
				res.append(sep)
			end
			res.append("{k or else "<null>"}{couple_sep}{e or else "<null>"}")
		end
		return res.to_s
	end
end