lib/core: remove duplicates and improve doc of `NativeString` related services
[nit.git] / lib / core / text / flat.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # All the array-based text representations
12 module flat
13
14 intrude import abstract_text
15 intrude import native
16
17 `{
18 #include <stdio.h>
19 #include <string.h>
20 `}
21
# Iterator over the substrings of a flat text: it yields `tgt` once, then stops.
private class FlatSubstringsIter
	super Iterator[FlatText]

	# Text still to be yielded, `null` once it has been consumed
	var tgt: nullable FlatText

	redef fun item
	do
		assert is_ok
		return tgt.as(not null)
	end

	redef fun is_ok
	do
		return tgt != null
	end

	redef fun next
	do
		tgt = null
	end
end
36
37 redef class FlatText
38
# Offset in `_items` of the first byte of the text (0 here)
protected fun first_byte: Int
do
	return 0
end

# Offset in `_items` of the last byte of the text
protected fun last_byte: Int
do
	return _bytelen + first_byte - 1
end

# Index (in characters) of the latest position explored in the text
var position: Int = 0

# Offset (in bytes) in `_items` matching the cached `position`
var bytepos: Int = 0
50
# Byte offset in `_items` of the character at position `index`
#
# The cached pair `_position`/`_bytepos` is updated to the requested
# character before returning. No bound checking is done on `index`.
fun char_to_byte_index(index: Int): Int do
	var offset = index - _position
	var bytes = _items
	var cur = _bytepos

	# Same character as the cached one: nothing to compute
	if offset == 0 then return cur

	# Direct neighbour of the cached character: a single step is enough
	if offset == 1 or offset == -1 then
		if offset == -1 then
			cur = bytes.find_beginning_of_char_at(cur - 1)
		else if bytes[cur] & 0x80u8 == 0x00u8 then
			# The cached character is ASCII, the next one starts one byte later
			cur += 1
		else
			cur += bytes.length_of_char_at(cur)
		end
		_position = index
		_bytepos = cur
		return cur
	end

	# Otherwise start from the closest known anchor:
	# the cache, the beginning or the end of the text.
	var cached_char = _position
	var from_start = index
	var from_end = (_length - 1) - index
	var from_cache = (cached_char - index).abs
	var best = from_start
	if from_cache < best then best = from_cache
	if from_end < best then best = from_end

	var anchor_char: Int
	var anchor_byte: Int
	if best == from_cache then
		anchor_char = cached_char
		anchor_byte = cur
	else if best == from_start then
		anchor_char = 0
		anchor_byte = first_byte
	else
		anchor_char = _length - 1
		anchor_byte = bytes.find_beginning_of_char_at(last_byte)
	end

	var res = bytes.char_to_byte_index_cached(index, anchor_char, anchor_byte)
	_position = index
	_bytepos = res
	return res
end
107
# Number of extra bytes needed to store the HTML-escaped version of `self`
#
# Returns 0 when no byte of `self` needs escaping.
fun chars_to_html_escape: Int do
	var bytes = _items
	var stop = last_byte
	var extra = 0
	var i = first_byte
	while i <= stop do
		var b = bytes[i]
		if b == 0x3Cu8 or b == 0x3Eu8 then
			# `<` and `>` are written on 4 bytes (`&lt;`, `&gt;`)
			extra += 3
		else if b == 0x26u8 or b == 0x22u8 or b == 0x27u8 or b == 0x2Fu8 then
			# `&`, `"`, `'` and `/` are written on 5 bytes
			extra += 4
		end
		i += 1
	end
	return extra
end
133
# HTML-escaped copy of `self`
#
# The following bytes are replaced by an entity:
#
# * 0x3C (<) => &lt;
# * 0x3E (>) => &gt;
# * 0x26 (&) => &amp;
# * 0x22 (") => &#34;
# * 0x27 (') => &#39;
# * 0x2F (/) => &#47;
#
# When nothing needs escaping, `to_s` is returned without any copy.
redef fun html_escape
do
	var growth = chars_to_html_escape
	if growth == 0 then return to_s
	var src = _items
	var stop = last_byte
	var i = first_byte
	var out_len = growth + _bytelen
	var out = new NativeString(out_len)
	var o = 0
	while i <= stop do
		var b = src[i]
		i += 1
		if b == 0x3Cu8 or b == 0x3Eu8 then
			# `&lt;` or `&gt;`: only the second byte differs
			out[o] = 0x26u8
			if b == 0x3Cu8 then out[o + 1] = 0x6Cu8 else out[o + 1] = 0x67u8
			out[o + 2] = 0x74u8
			out[o + 3] = 0x3Bu8
			o += 4
		else if b == 0x26u8 then
			# `&amp;`
			out[o] = 0x26u8
			out[o + 1] = 0x61u8
			out[o + 2] = 0x6Du8
			out[o + 3] = 0x70u8
			out[o + 4] = 0x3Bu8
			o += 5
		else if b == 0x22u8 or b == 0x27u8 or b == 0x2Fu8 then
			# Numeric entities `&#34;`, `&#39;` and `&#47;`
			out[o] = 0x26u8
			out[o + 1] = 0x23u8
			if b == 0x2Fu8 then
				out[o + 2] = 0x34u8
				out[o + 3] = 0x37u8
			else
				out[o + 2] = 0x33u8
				if b == 0x22u8 then out[o + 3] = 0x34u8 else out[o + 3] = 0x39u8
			end
			out[o + 4] = 0x3Bu8
			o += 5
		else
			out[o] = b
			o += 1
		end
	end
	return new FlatString.with_infos(out, out_len, 0)
end
205
# Number of extra bytes needed to store the C-escaped version of `self`
#
# `escape_to_c` relies on it twice: a result of 0 means that `self`
# can be returned as-is, any other value gives the size to allocate.
fun chars_to_escape_to_c: Int do
	var bytes = _items
	var stop = last_byte
	var extra = 0
	var i = first_byte
	while i <= stop do
		var b = bytes[i]
		if b == 0x0Au8 or b == 0x09u8 or b == 0x22u8 or b == 0x27u8 or b == 0x5Cu8 then
			# Escaped on two bytes: a backslash and one more byte
			extra += 1
		else if b < 32u8 then
			# Other control bytes are written on four bytes
			extra += 3
		end
		i += 1
	end
	return extra
end
235
# C-escaped copy of `self`
#
# * 0x09 => \t
# * 0x0A => \n
# * 0x22 => \"
# * 0x27 => \'
# * 0x5C => \\
# * any other byte < 32 => a backslash, `0` and two octal digits
#
# When nothing needs escaping, `to_s` is returned without any copy.
redef fun escape_to_c do
	var growth = chars_to_escape_to_c
	if growth == 0 then return self.to_s
	var src = _items
	var stop = last_byte
	var out_len = _bytelen + growth
	var out = new NativeString(out_len)
	var i = first_byte
	var o = 0
	while i <= stop do
		var b = src[i]
		i += 1
		if b == 0x09u8 or b == 0x0Au8 or b == 0x22u8 or b == 0x27u8 or b == 0x5Cu8 then
			out[o] = 0x5Cu8
			if b == 0x09u8 then
				out[o + 1] = 0x74u8
			else if b == 0x0Au8 then
				out[o + 1] = 0x6Eu8
			else
				# Quotes and backslash are followed by themselves
				out[o + 1] = b
			end
			o += 2
		else if b < 32u8 then
			out[o] = 0x5Cu8
			out[o + 1] = 0x30u8
			out[o + 2] = ((b & 0x38u8) >> 3) + 0x30u8
			out[o + 3] = (b & 0x07u8) + 0x30u8
			o += 4
		else
			out[o] = b
			o += 1
		end
	end
	return out.to_s_unsafe(out_len)
end
297
# Character at position `index` (counted in characters, not in bytes)
#
# `index` must be in `[0, length[`. The cached position
# (`_position`/`_bytepos`) is used to answer quickly when the previous,
# the same or the next character is requested.
redef fun [](index) do
	var len = _length

	# Statistically:
	# * ~70% want the next char
	# * ~23% want the previous
	# * ~7% want the same char
	#
	# So it makes sense to shortcut early. And early is here.
	var dpos = index - _position
	var b = _bytepos
	if dpos == 1 and index < len - 1 then
		var its = _items
		var c = its[b]
		if c & 0x80u8 == 0x00u8 then
			# We want the next, and current is easy.
			# So next is easy to find!
			b += 1
			_position = index
			_bytepos = b
			# The rest will be done by `dpos == 0` below.
			dpos = 0
		end
	else if dpos == -1 and index > 1 then
		var its = _items
		var c = its[b-1]
		if c & 0x80u8 == 0x00u8 then
			# We want the previous, and it is easy.
			_position = index
			_bytepos = b - 1
			return c.ascii
		end
	end

	# Checked before the `dpos == 0` shortcut too: the cache starts at
	# position 0, which is not a valid index of an empty text, so the
	# shortcut alone would read bytes that do not belong to `self`.
	assert index >= 0 and index < len

	if dpos == 0 then
		# We know what we want (+0 or +1) just get it now!
		var its = _items
		var c = its[b]
		if c & 0x80u8 == 0x00u8 then return c.ascii
		return items.char_at(b)
	end

	return fetch_char_at(index)
end
344
# Character at position `index` of `self`, found through `char_to_byte_index`
#
# WARNING: no bound checking is done, use at your own risk
fun fetch_char_at(index: Int): Char do
	var offset = char_to_byte_index(index)
	var bytes = _items
	var lead = bytes[offset]
	# A set high bit announces a multi-byte sequence that must be decoded
	if lead & 0x80u8 != 0x00u8 then return bytes.char_at(offset)
	return lead.ascii
end
355
# Integer value of the hexadecimal digits of `self`
#
# `pos` is the index of the first character to read (0 when `null`) and
# `ln` the number of digits to read (up to the end of `self` when `null`).
#
#     assert "ff".to_hex == 255
redef fun to_hex(pos, ln) do
	if pos == null then pos = 0
	if ln == null then ln = length - pos
	var bytes = _items
	var i = char_to_byte_index(pos)
	var stop = i + ln
	var acc = 0
	while i < stop do
		# Each digit adds 4 bits to the result
		acc = (acc << 4) + bytes[i].ascii.from_hex
		i += 1
	end
	return acc
end
372 end
373
374 # Immutable strings of characters.
375 class FlatString
376 super FlatText
377 super String
378
379 # Index at which `self` begins in `_items`, inclusively
380 redef var first_byte is noinit
381
382 redef var chars = new FlatStringCharView(self) is lazy
383
384 redef var bytes = new FlatStringByteView(self) is lazy
385
# Copy of the bytes of `self` followed by a 0 byte, computed on first access
redef var to_cstring is lazy do
	var size = _bytelen
	var copy = new NativeString(size + 1)
	copy[size] = 0u8
	_items.copy_to(copy, size, _first_byte, 0)
	return copy
end
393
# New string with the characters of `self` in reverse order
redef fun reversed do
	var char_count = _length
	var buf = new FlatBuffer.with_capacity(_bytelen + 1)
	# Append the characters from the last one to the first one
	for back in [1 .. char_count] do buf.add fetch_char_at(char_count - back)
	var res = buf.to_s.as(FlatString)
	res._length = char_count
	return res
end
405
# `fast_cstring` of `_items`, taken at the first byte of `self`
redef fun fast_cstring
do
	return _items.fast_cstring(_first_byte)
end
407
# Substring from the character `from` to the end of `self`
#
# `_items` is shared with the result, no byte is copied.
redef fun substring_from(from) do
	var total = _length
	if from >= total then return empty
	if from <= 0 then return self
	var start = char_to_byte_index(from)
	# Bytes left once the bytes before `start` are skipped
	var remaining = _bytelen - (start - _first_byte)
	return new FlatString.full(_items, remaining, start, total - from)
end
416
# Substring of `count` characters starting at the character `from`
#
# A negative `from` is moved to 0 and `count` is reduced accordingly.
# `count` is also reduced when it goes past the end of `self`.
redef fun substring(from, count)
do
	if count <= 0 then return ""

	if from < 0 then
		count += from
		if count < 0 then return ""
		from = 0
	end

	var available = _length - from
	if count > available then count = available
	if count <= 0 then return ""
	return substring_impl(from, count, from + count - 1)
end
433
# Substring of `count` characters from the character `from` to the character `end_index`
#
# Both indexes are inclusive and must be valid indexes of `self`, with
# `count == end_index - from + 1`. `_items` is shared with the result.
private fun substring_impl(from, count, end_index: Int): String do
	# Distances (in characters) between the cached position and both ends
	# of the substring. The closest end is resolved first, so that the
	# second lookup starts from the first one.
	var cache = _position
	var dfrom = (cache - from).abs
	var dend = (cache - end_index).abs

	var bytefrom: Int
	var byteto: Int
	if dfrom < dend then
		bytefrom = char_to_byte_index(from)
		byteto = char_to_byte_index(end_index)
	else
		byteto = char_to_byte_index(end_index)
		bytefrom = char_to_byte_index(from)
	end

	# `byteto` is the first byte of the last character, move it to its last byte
	var its = _items
	byteto += its.length_of_char_at(byteto) - 1

	return new FlatString.full(its, byteto - bytefrom + 1, bytefrom, count)
end
455
# The empty string
redef fun empty
do
	return "".as(FlatString)
end
457
# New string where each character of `self` is replaced by its `to_upper`
redef fun to_upper
do
	var res = new FlatBuffer.with_capacity(_bytelen + 1)
	var view = chars
	for i in [0 .. _length[ do res.add(view[i].to_upper)
	return res.to_s
end
472
# New string where each character of `self` is replaced by its `to_lower`
redef fun to_lower
do
	var res = new FlatBuffer.with_capacity(_bytelen + 1)
	var view = chars
	for i in [0 .. _length[ do res.add(view[i].to_lower)
	return res.to_s
end
487
# Calls `output` on each character of `self`, in order
redef fun output
do
	var it = chars.iterator
	while it.is_ok do
		it.item.output
		it.next
	end
end
492
493 ##################################################
494 # String Specific Methods #
495 ##################################################
496
# Low-level creation of a new string from its bytes only.
#
# The string is made of the `bytelen` bytes of `items` starting at `from`.
# Its length in characters is computed with `utf8_length`.
#
# `items` is used as is, without copy: aliasing issues are the
# responsibility of the caller.
private init with_infos(items: NativeString, bytelen, from: Int)
do
	_items = items
	_first_byte = from
	_bytepos = from
	_bytelen = bytelen
	_length = items.utf8_length(from, bytelen)
end
509
# Low-level creation of a new string when everything is already known.
#
# The string is made of the `bytelen` bytes of `items` starting at `from`,
# and `length` is trusted as its number of characters.
#
# `items` is used as is, without copy: aliasing issues are the
# responsibility of the caller.
private init full(items: NativeString, bytelen, from, length: Int)
do
	_items = items
	_first_byte = from
	_bytepos = from
	_bytelen = bytelen
	_length = length
end
522
# Are `self` and `other` made of the same bytes?
#
# The comparison is delegated to `super` when `other` is not a `FlatText`.
redef fun ==(other)
do
	if not other isa FlatText then return super

	if self.object_id == other.object_id then return true

	var size = _bytelen
	if other._bytelen != size then return false

	var mine = _items
	var theirs = other._items
	var i = _first_byte
	# Distance between matching bytes of both texts
	var shift = other.first_byte - i
	var stop = i + size
	while i < stop do
		if mine[i] != theirs[i + shift] then return false
		i += 1
	end
	return true
end
549
# Is `self` strictly before `other` when both are compared byte by byte?
#
# When one text is a prefix of the other, the shortest one comes first.
# The comparison is delegated to `super` when `other` is not a `FlatText`.
redef fun <(other)
do
	if not other isa FlatText then return super

	if self.object_id == other.object_id then return false

	var mine = _items
	var theirs = other._items
	var my_len = _bytelen
	var its_len = other.bytelen

	# Only the bytes present in both texts can be compared
	var common = my_len
	if its_len < common then common = its_len

	var i = _first_byte
	var j = other.first_byte
	var stop = i + common
	while i < stop do
		var a = mine[i]
		var b = theirs[j]
		if a != b then return a < b
		i += 1
		j += 1
	end

	return my_len < its_len
end
579
# Concatenation of `self` and the `to_s` of `o` in a new string
#
# The bytes of both operands are copied into a fresh `NativeString`
# that ends with a 0 byte. Aborts when `o.to_s` is not a `FlatText`.
redef fun +(o) do
	var s = o.to_s
	var slen = s.bytelen
	var mlen = _bytelen
	var nlen = mlen + slen
	var mits = _items
	var mifrom = _first_byte
	if s isa FlatText then
		var sits = s._items
		var sifrom = s.first_byte
		var ns = new NativeString(nlen + 1)
		mits.copy_to(ns, mlen, mifrom, 0)
		sits.copy_to(ns, slen, sifrom, mlen)
		# The extra byte is allocated for the terminator: write it,
		# as the other string builders of this module do.
		ns[nlen] = 0u8
		return new FlatString.full(ns, nlen, 0, _length + s.length)
	else
		abort
	end
end
598
# New string made of `i` copies of `self`
#
# The bytes are stored in a fresh `NativeString` that ends with a 0 byte.
redef fun *(i) do
	var unit_bytes = _bytelen
	var total_bytes = unit_bytes * i
	var total_chars = _length * i
	var src = _items
	var start = _first_byte
	var out = new NativeString(total_bytes + 1)
	out[total_bytes] = 0u8
	# The n-th copy starts `n * unit_bytes` bytes after the beginning
	for n in [0 .. i[ do src.copy_to(out, unit_bytes, start, n * unit_bytes)
	return new FlatString.full(out, total_bytes, 0, total_chars)
end
616
617
# Hash of the bytes of `self` (djb2 algorithm)
#
# The value is computed once and then kept in `hash_cache`.
redef fun hash
do
	var cached = hash_cache
	if cached != null then return cached

	var h = 5381
	var bytes = _items
	var stop = last_byte
	var i = _first_byte
	while i <= stop do
		h = (h << 5) + h + bytes[i].to_i
		i += 1
	end

	hash_cache = h
	return h
end
638
# Iterator that yields `self` as its only substring
redef fun substrings
do
	return new FlatSubstringsIter(self)
end
640 end
641
# Iterator on the characters of a `FlatString`, from `curr_pos` down to index 0
private class FlatStringCharReverseIterator
	super IndexedIterator[Char]

	# String whose characters are visited
	var target: FlatString

	# Index (in characters) of the current item
	var curr_pos: Int

	redef fun is_ok
	do
		return curr_pos >= 0
	end

	redef fun item
	do
		return target[curr_pos]
	end

	redef fun next
	do
		curr_pos -= 1
	end

	redef fun index
	do
		return curr_pos
	end
end
658
# Iterator on the characters of a `FlatString`, from `curr_pos` up to the last one
private class FlatStringCharIterator
	super IndexedIterator[Char]

	# String whose characters are visited
	var target: FlatString

	# Index of the last character of `target`, computed at creation
	var max: Int is noautoinit

	# Index (in characters) of the current item
	var curr_pos: Int

	init
	do
		max = target._length - 1
	end

	redef fun is_ok
	do
		return curr_pos <= max
	end

	redef fun item
	do
		return target[curr_pos]
	end

	redef fun next
	do
		curr_pos += 1
	end

	redef fun index
	do
		return curr_pos
	end
end
679
# View on the characters of a `FlatString`
private class FlatStringCharView
	super StringCharView

	redef type SELFTYPE: FlatString

	redef fun [](index)
	do
		return target[index]
	end

	redef fun iterator_from(start)
	do
		return new FlatStringCharIterator(target, start)
	end

	redef fun reverse_iterator_from(start)
	do
		return new FlatStringCharReverseIterator(target, start)
	end
end
692
# Iterator on the bytes of a `FlatString`, from a given byte down to the first one
private class FlatStringByteReverseIterator
	super IndexedIterator[Byte]

	# String whose bytes are visited
	var target: FlatString

	# `_items` of `target`, fetched once at creation
	var target_items: NativeString is noautoinit

	# Given relative to the string at creation,
	# then kept as an offset in `target_items`
	var curr_pos: Int

	init
	do
		var str = target
		target_items = str._items
		curr_pos += str._first_byte
	end

	redef fun is_ok
	do
		return curr_pos >= target._first_byte
	end

	redef fun item
	do
		return target_items[curr_pos]
	end

	redef fun next
	do
		curr_pos -= 1
	end

	redef fun index
	do
		return curr_pos - target._first_byte
	end
end
718
# Iterator on the bytes of a `FlatString`, from a given byte up to the last one
private class FlatStringByteIterator
	super IndexedIterator[Byte]

	# String whose bytes are visited
	var target: FlatString

	# `_items` of `target`, fetched once at creation
	var target_items: NativeString is noautoinit

	# Given relative to the string at creation,
	# then kept as an offset in `target_items`
	var curr_pos: Int

	init
	do
		var str = target
		target_items = str._items
		curr_pos += str._first_byte
	end

	redef fun is_ok
	do
		return curr_pos <= target.last_byte
	end

	redef fun item
	do
		return target_items[curr_pos]
	end

	redef fun next
	do
		curr_pos += 1
	end

	redef fun index
	do
		return curr_pos - target._first_byte
	end
end
744
# View on the bytes of a `FlatString`
private class FlatStringByteView
	super StringByteView

	redef type SELFTYPE: FlatString

	redef fun [](index)
	do
		var str = _target
		# `index` is relative to the string and must be one of its bytes
		assert index >= 0 and index < str._bytelen
		return str._items[str._first_byte + index]
	end

	redef fun iterator_from(start)
	do
		return new FlatStringByteIterator(target, start)
	end

	redef fun reverse_iterator_from(start)
	do
		return new FlatStringByteReverseIterator(target, start)
	end
end
765
redef class Buffer
	# A new `Buffer` is a `FlatBuffer`
	redef new
	do
		return new FlatBuffer
	end

	# A new `Buffer` with a capacity is a `FlatBuffer` created by `with_capacity`
	redef new with_cap(i)
	do
		return new FlatBuffer.with_capacity(i)
	end
end
771
772 # Mutable strings of characters.
773 class FlatBuffer
774 super FlatText
775 super Buffer
776
777 redef var chars: Sequence[Char] = new FlatBufferCharView(self) is lazy
778
779 redef var bytes = new FlatBufferByteView(self) is lazy
780
781 private var char_cache: Int = -1
782
783 private var byte_cache: Int = -1
784
785 private var capacity = 0
786
787 # Real items, used as cache for when to_cstring is called
788 private var real_items: NativeString is noinit
789
# `fast_cstring` of `_items`, taken at byte 0
redef fun fast_cstring
do
	return _items.fast_cstring(0)
end

# Iterator that yields `self` as its only substring
redef fun substrings
do
	return new FlatSubstringsIter(self)
end
793
# Replaces `_items` by a fresh copy of `capacity` bytes and clears `written`
#
# Called before a modification when the flag `written` is set.
private fun reset do
	var fresh = new NativeString(capacity)
	var used = _bytelen
	if used != 0 then _items.copy_to(fresh, used, 0, 0)
	_items = fresh
	written = false
end
804
# Shifts the content of the buffer by `len` bytes to the right, starting at byte `from`
#
# When the shifted content does not fit in `capacity`, a larger
# `NativeString` is allocated, the bytes before `from` are copied into it
# and it becomes the new `_items`.
#
# Internal only, does not modify `_bytelen` or `_length`, this is the caller's responsibility
private fun rshift_bytes(from: Int, len: Int) do
	var oit = _items
	var nit = oit
	var bt = _bytelen
	var needed = bt + len
	if needed > capacity then
		var ncap = capacity
		while ncap < needed do ncap = ncap * 2 + 2
		nit = new NativeString(ncap + 1)
		# The bytes before `from` do not move: copy them as they are
		oit.copy_to(nit, from, 0, 0)
		capacity = ncap
	end
	oit.copy_to(nit, bt - from, from, from + len)
	_items = nit
end

# Shifts the content of the buffer by `len` bytes to the left, starting at `from`
#
# Internal only, does not modify `_bytelen` or `_length`, this is the caller's responsibility
private fun lshift_bytes(from: Int, len: Int) do
	var it = _items
	it.copy_to(it, _bytelen - from, from, from - len)
end

# Replaces the character at `index` by `item`
#
# `index == length` is accepted and appends `item`.
# When both characters do not use the same number of bytes, the bytes
# after the replaced character are shifted accordingly.
redef fun []=(index, item)
do
	assert index >= 0 and index <= _length
	if written then reset
	is_dirty = true
	if index == _length then
		add item
		return
	end
	var it = _items
	var ip = it.char_to_byte_index(index)
	var clen = it.char_at(ip).u8char_len
	var size_diff = item.u8char_len - clen
	if size_diff > 0 then
		rshift_bytes(ip + clen, size_diff)
		# The shift may have replaced `_items` by a larger buffer
		it = _items
	else if size_diff < 0 then
		lshift_bytes(ip + clen, -size_diff)
	end
	_bytelen += size_diff
	# The cached byte offset moves with the bytes when it is after `index`
	if size_diff != 0 and _position > index then _bytepos += size_diff
	it.set_char_at(ip, item)
end
851
# Appends the character `c` at the end of the buffer
redef fun add(c)
do
	if written then reset
	is_dirty = true
	var at = _bytelen
	var width = c.u8char_len
	# Make room for the bytes of `c` before writing them
	enlarge(at + width)
	_items.set_char_at(at, c)
	_bytelen = at + width
	_length += 1
end
863
# Removes all the characters of the buffer
redef fun clear do
	if written then reset
	is_dirty = true
	_length = 0
	_bytelen = 0
end
870
# A new empty `Buffer`
redef fun empty
do
	return new Buffer
end
872
# Makes sure that `capacity` is at least `cap`
#
# Nothing is done when `cap` already fits. Otherwise the capacity grows
# (`c * 2 + 2`) until it is greater than `cap`, and the bytes of the
# buffer are copied into a new `NativeString`.
redef fun enlarge(cap)
do
	var grown = capacity
	if cap <= grown then return
	while grown <= cap do grown = grown * 2 + 2
	# The content is copied below, so the Copy-On-Write flag can be cleared
	written = false
	var fresh = new NativeString(grown + 1)
	var used = _bytelen
	if used > 0 then _items.copy_to(fresh, used, 0, 0)
	_items = fresh
	capacity = grown
end
890
# `FlatString` with the current content of the buffer
#
# `_items` is shared with the result, so `written` is set to force
# a copy before the next modification of the buffer.
redef fun to_s
do
	written = true
	var used = _bytelen
	if used == 0 then _items = new NativeString(1)
	return new FlatString.full(_items, used, 0, _length)
end
898
# Copy of the bytes of the buffer followed by a 0 byte
#
# The copy is kept in `real_items` and only redone when `is_dirty` is set.
redef fun to_cstring
do
	if not is_dirty then return real_items
	var used = _bytelen
	var copy = new NativeString(used + 1)
	copy[used] = 0u8
	if _length > 0 then _items.copy_to(copy, used, 0, 0)
	real_items = copy
	is_dirty = false
	return copy
end
911
912 # Create a new empty string.
913 init do end
914
# Low-level creation of a new buffer from existing data.
#
# `items` is used as is, without copy, to store the bytes of the buffer:
# aliasing issues are the responsibility of the caller.
#
# If `items` is shared, `written` should be set to true after the creation
# so that a modification will do a copy-on-write.
private init with_infos(items: NativeString, capacity, bytelen, length: Int)
do
	_items = items
	_bytelen = bytelen
	_length = length
	self.capacity = capacity
end
929
# Create a new buffer with a copy of the content of `s`.
#
# The bytes of each substring of `s` are copied one after the other into
# a fresh `NativeString`, so the new buffer shares nothing with `s`.
init from(s: Text)
do
	var blen = s.bytelen
	var nits = new NativeString(blen + 1)
	var off = 0
	for part in s.substrings do
		var plen = part._bytelen
		# A substring may start anywhere in its `_items`
		part._items.copy_to(nits, plen, part.first_byte, off)
		off += plen
	end
	_items = nits
	_bytelen = blen
	_length = s.length
	_capacity = blen
	# Nothing is shared, no copy-on-write is needed
	written = false
end
944
# Create a new empty buffer able to store `cap` bytes.
#
# `cap` must not be negative.
init with_capacity(cap: Int)
do
	assert cap >= 0
	_bytelen = 0
	capacity = cap
	# One more byte than the capacity is allocated
	_items = new NativeString(cap + 1)
end
953
# Appends the content of `s` at the end of the buffer
#
# The bytes of a `FlatText` are copied in one go. Any other text is
# appended one substring at a time.
redef fun append(s)
do
	if s.is_empty then return
	is_dirty = true
	var extra = s.bytelen
	var total = _bytelen + extra
	enlarge(total)
	if not s isa FlatText then
		for part in s.substrings do append part
		return
	end
	s._items.copy_to(_items, extra, s.first_byte, _bytelen)
	_bytelen = total
	_length += s.length
end
970
# Copies `len` characters of `self`, starting at `start`, into `dest` starting at `new_start`
fun copy(start: Int, len: Int, dest: Buffer, new_start: Int)
do
	var src = self.chars
	var dst = dest.chars
	var i = 0
	while i < len do
		dst[new_start + i] = src[start + i]
		i += 1
	end
end
980
# New buffer with a copy of `count` characters of `self` starting at `from`
#
# `count` must not be negative. A negative `from` is replaced by 0 and
# `count` is reduced when it goes past the end of the buffer.
redef fun substring(from, count)
do
	assert count >= 0
	if from < 0 then from = 0
	var available = _length - from
	if count > available then count = available
	if count <= 0 then return new Buffer
	var src = _items
	var first = src.char_to_byte_index(from)
	var last = src.char_to_byte_index(from + count - 1)
	# `last` is the first byte of the last character, move it to its last byte
	last += src.char_at(last).u8char_len - 1
	var size = last - first + 1
	var copy = new NativeString(size)
	src.copy_to(copy, size, first, 0)
	return new FlatBuffer.with_infos(copy, size, size, count)
end
996
# Reverses the order of the characters of the buffer, in place
#
# The characters are written in reverse order into a new buffer
# whose `_items` then replaces the one of `self`.
redef fun reverse
do
	written = false
	var ns = new FlatBuffer.with_capacity(capacity)
	for i in chars.reverse_iterator do ns.add i
	_items = ns._items
	# The bytes changed: `to_cstring` must rebuild its copy and the
	# cached position no longer matches the new layout.
	is_dirty = true
	_position = 0
	_bytepos = 0
end
1004
# Appends the current content of the buffer `repeats - 1` times to itself
redef fun times(repeats)
do
	# Snapshot of the content before any append
	var snapshot = new FlatString.full(_items, _bytelen, 0, _length)
	var n = 1
	while n < repeats do
		append(snapshot)
		n += 1
	end
end
1013
# Replaces, in place, each character of the buffer by its `to_upper`
redef fun upper
do
	if written then reset
	var count = _length
	var i = 0
	while i < count do
		self[i] = self[i].to_upper
		i += 1
	end
end
1019
# Replaces, in place, each character of the buffer by its `to_lower`
redef fun lower
do
	if written then reset
	var count = _length
	var i = 0
	while i < count do
		self[i] = self[i].to_lower
		i += 1
	end
end
1025 end
1026
# Iterator on the bytes of a `FlatBuffer`, from `curr_pos` down to byte 0
private class FlatBufferByteReverseIterator
	super IndexedIterator[Byte]

	# Buffer whose bytes are visited
	var target: FlatBuffer

	# `_items` of `target`, fetched once at creation
	var target_items: NativeString is noautoinit

	# Offset of the current byte
	var curr_pos: Int

	init
	do
		target_items = target._items
	end

	redef fun index
	do
		return curr_pos
	end

	redef fun is_ok
	do
		return curr_pos >= 0
	end

	redef fun item
	do
		return target_items[curr_pos]
	end

	redef fun next
	do
		curr_pos -= 1
	end
end
1047
# View on the bytes of a `FlatBuffer`
private class FlatBufferByteView
	super BufferByteView

	redef type SELFTYPE: FlatBuffer

	redef fun [](index)
	do
		return target._items[index]
	end

	redef fun iterator_from(pos)
	do
		return new FlatBufferByteIterator(target, pos)
	end

	redef fun reverse_iterator_from(pos)
	do
		return new FlatBufferByteReverseIterator(target, pos)
	end
end
1060
# Iterator on the bytes of a `FlatBuffer`, from `curr_pos` up to the last byte
private class FlatBufferByteIterator
	super IndexedIterator[Byte]

	# Buffer whose bytes are visited
	var target: FlatBuffer

	# `_items` of `target`, fetched once at creation
	var target_items: NativeString is noautoinit

	# Offset of the current byte
	var curr_pos: Int

	init
	do
		target_items = target._items
	end

	redef fun index
	do
		return curr_pos
	end

	redef fun is_ok
	do
		return curr_pos < target._bytelen
	end

	redef fun item
	do
		return target_items[curr_pos]
	end

	redef fun next
	do
		curr_pos += 1
	end
end
1081
# Iterator on the characters of a `FlatBuffer`, from `curr_pos` down to index 0
private class FlatBufferCharReverseIterator
	super IndexedIterator[Char]

	# Buffer whose characters are visited
	var target: FlatBuffer

	# Index (in characters) of the current item
	var curr_pos: Int

	redef fun index
	do
		return curr_pos
	end

	redef fun is_ok
	do
		return curr_pos >= 0
	end

	redef fun item
	do
		return target[curr_pos]
	end

	redef fun next
	do
		curr_pos -= 1
	end
end
1098
# View on the characters of a `FlatBuffer`
#
# The modifications are forwarded to the buffer.
private class FlatBufferCharView
	super BufferCharView

	redef type SELFTYPE: FlatBuffer

	redef fun [](index)
	do
		return target[index]
	end

	# `index == length` is accepted and appends `item`
	redef fun []=(index, item)
	do
		assert index >= 0 and index <= length
		if index != length then
			target[index] = item
			return
		end
		add(item)
	end

	redef fun push(c)
	do
		target.add(c)
	end

	redef fun add(c)
	do
		target.add(c)
	end

	# Calls `enlarge(cap)` on the buffer
	fun enlarge(cap: Int)
	do
		target.enlarge(cap)
	end

	redef fun append(s)
	do
		var incoming = s.length
		if target.capacity < incoming then enlarge(incoming + target._length)
		for c in s do target.add c
	end

	redef fun iterator_from(pos)
	do
		return new FlatBufferCharIterator(target, pos)
	end

	redef fun reverse_iterator_from(pos)
	do
		return new FlatBufferCharReverseIterator(target, pos)
	end
end
1143
# Iterator on the characters of a `FlatBuffer`, from `curr_pos` up to the last one
private class FlatBufferCharIterator
	super IndexedIterator[Char]

	# Buffer whose characters are visited
	var target: FlatBuffer

	# Index of the last character of `target`, computed at creation
	var max: Int is noautoinit

	# Index (in characters) of the current item
	var curr_pos: Int

	init
	do
		max = target._length - 1
	end

	redef fun index
	do
		return curr_pos
	end

	redef fun is_ok
	do
		return curr_pos <= max
	end

	redef fun item
	do
		return target[curr_pos]
	end

	redef fun next
	do
		curr_pos += 1
	end
end
1164
1165 redef class NativeString
# String made of the `cstring_length` first bytes of `self`, see `to_s_with_length`
redef fun to_s
do
	return to_s_with_length(cstring_length)
end

# String made of the `length` first bytes of `self`, cleaned by `clean_utf8`
#
# `length` must not be negative.
redef fun to_s_with_length(length)
do
	assert length >= 0
	return clean_utf8(length)
end

# String of `bytelen` bytes and `unilen` characters that uses `self` without copy
redef fun to_s_full(bytelen, unilen)
do
	return new FlatString.full(self, bytelen, 0, unilen)
end

# String of `len` bytes (`cstring_length` when `null`) that uses `self`
# without copy and without going through `clean_utf8`
redef fun to_s_unsafe(len)
do
	var size = len or else cstring_length
	return new FlatString.with_infos(self, size, 0)
end

# Same as `to_s_with_copy_and_length` with `cstring_length` as length
redef fun to_s_with_copy
do
	return to_s_with_copy_and_length(cstring_length)
end
1187
# Get a `String` from `length` bytes at `self` copied into Nit memory
#
# When `clean_utf8` already answers with other bytes than `self`, its
# result is returned as is. Otherwise the bytes are copied and the copy,
# which ends with a 0 byte, also becomes the `to_cstring` of the result.
fun to_s_with_copy_and_length(length: Int): String
do
	var cleaned = clean_utf8(length)
	if cleaned.items != self then return cleaned
	var copy = new NativeString(length + 1)
	copy_to(copy, length, 0, 0)
	var res = new FlatString.with_infos(copy, length, 0)
	copy[length] = 0u8
	res.to_cstring = copy
	return res
end
1200
# String made of the `len` first bytes of `self`, where each invalid byte
# is replaced by the 3 bytes 0xEF 0xBF 0xBD
#
# A byte is invalid when it does not start a sequence of the length given
# by `length_of_char_at`, or when the decoded code point is outside the
# range accepted for that length.
#
# `self` is used without copy when nothing has to be replaced.
fun clean_utf8(len: Int): FlatString do
	# Offsets of the invalid bytes, allocated when the first one is found
	var bad_offsets: nullable Array[Int] = null
	var out_len = len
	var at = 0
	var char_count = 0
	var left = len
	while left > 0 do
		# Skip 4 bytes at a time while none of them has its high bit set
		while left >= 4 do
			if fetch_4_chars(at) & 0x80808080 != 0 then break
			at += 4
			char_count += 4
			left -= 4
		end
		if left == 0 then break
		var lead = self[at]
		if lead & 0x80u8 == 0x00u8 then
			at += 1
			char_count += 1
			left -= 1
			continue
		end
		# Does the first byte match the length of the sequence?
		var seq_len = length_of_char_at(at)
		var well_formed: Bool
		if seq_len == 1 then
			well_formed = lead & 0x80u8 == 0u8
		else if seq_len == 2 then
			well_formed = lead & 0xE0u8 == 0xC0u8
		else if seq_len == 3 then
			well_formed = lead & 0xF0u8 == 0xE0u8
		else
			well_formed = lead & 0xF8u8 == 0xF0u8
		end
		if well_formed then
			# Is the code point in the range of its sequence length?
			var c = char_at(at)
			var cp = c.code_point
			var in_range: Bool
			if seq_len == 1 then
				in_range = cp >= 0 and cp <= 0x7F
			else if seq_len == 2 then
				in_range = cp >= 0x80 and cp <= 0x7FF
			else if seq_len == 3 then
				in_range = cp >= 0x800 and cp <= 0xFFFF
				in_range = in_range and not (cp >= 0xD800 and cp <= 0xDFFF) and cp != 0xFFFE and cp != 0xFFFF
			else
				in_range = cp >= 0x10000 and cp <= 0x10FFFF
			end
			if in_range then
				var width = c.u8char_len
				at += width
				left -= width
				char_count += 1
				continue
			end
		end
		# Invalid byte: it counts for one character and its replacement
		# needs 2 more bytes
		if bad_offsets == null then bad_offsets = new Array[Int]
		bad_offsets.add at
		out_len += 2
		at += 1
		left -= 1
		char_count += 1
	end
	var res = self
	if out_len != len then
		res = new NativeString(out_len)
		var offsets = bad_offsets.as(not null)
		var raw = offsets.items.as(not null)
		var count = offsets.length
		var src_at = 0
		var dst_at = 0
		var k = 0
		while k < count do
			# Copy the valid bytes found before the invalid one
			var bad = raw[k]
			var chunk = bad - src_at
			copy_to(res, chunk, src_at, dst_at)
			dst_at += chunk
			res[dst_at] = 0xEFu8
			res[dst_at + 1] = 0xBFu8
			res[dst_at + 2] = 0xBDu8
			dst_at += 3
			src_at = bad + 1
			k += 1
		end
		# Copy what follows the last invalid byte
		copy_to(res, len - src_at, src_at, dst_at)
	end
	return new FlatString.full(res, out_len, 0, char_count)
end
1294
# Writes the UTF-8 encoding of `c` in the bytes of `self` starting at `pos`
#
# Very unsafe: make sure there is room for `c` before calling this function.
private fun set_char_at(pos: Int, c: Char) do
	var cp = c.code_point
	if cp >= 128 then
		native_set_char(pos, c, c.u8char_len)
	else
		# Code points below 128 are stored as a single byte
		self[pos] = cp.to_b
	end
end
1306
# Writes the `ln` bytes of the UTF-8 encoding of `c` at `pos`
#
# Nothing is written when `ln` is not between 1 and 4.
private fun native_set_char(pos: Int, c: Char, ln: Int) `{
	char* dst = self + pos;
	if (ln == 1) {
		dst[0] = c;
	} else if (ln == 2) {
		dst[0] = 0xC0 | ((c & 0x7C0) >> 6);
		dst[1] = 0x80 | (c & 0x3F);
	} else if (ln == 3) {
		dst[0] = 0xE0 | ((c & 0xF000) >> 12);
		dst[1] = 0x80 | ((c & 0xFC0) >> 6);
		dst[2] = 0x80 | (c & 0x3F);
	} else if (ln == 4) {
		dst[0] = 0xF0 | ((c & 0x1C0000) >> 18);
		dst[1] = 0x80 | ((c & 0x3F000) >> 12);
		dst[2] = 0x80 | ((c & 0xFC0) >> 6);
		dst[3] = 0x80 | (c & 0x3F);
	}
`}
1330 end
1331
redef class Int
	# Signed representation of `self` in base 10
	#
	#     assert 1.to_s            == "1"
	#     assert (-123).to_s       == "-123"
	redef fun to_s do
		# Shortcut for the most common values
		if self == 0 then return "0"
		if self == 1 then return "1"

		var digits = int_to_s_len
		var buf = new NativeString(digits + 1)
		buf[digits] = 0u8
		native_int_to_s(buf, digits + 1)
		return new FlatString.full(buf, digits, 0, digits)
	end
end
1349
redef class Array[E]

	# Concatenation of the `to_s` of each element, `null` elements are skipped
	#
	# The size of the result is computed first, so its bytes are
	# allocated only once.
	redef fun plain_to_s
	do
		var count = _length
		if count == 0 then return ""
		var elems = _items.as(not null)
		if count == 1 then
			var only = elems[0]
			if only == null then return ""
			return only.to_s
		end

		# First pass: get the strings and the total number of bytes
		var parts = new NativeArray[String](count)
		var kept = 0
		var total = 0
		for i in [0 .. count[ do
			var e = elems[i]
			if e == null then continue
			var str = e.to_s
			total += str.bytelen
			parts[kept] = str
			kept += 1
		end

		# Second pass: copy the bytes of each string one after the other
		var out = new NativeString(total + 1)
		out[total] = 0u8
		var at = 0
		for i in [0 .. kept[ do
			var str = parts[i]
			if str isa FlatString then
				var size = str._bytelen
				str._items.copy_to(out, size, str._first_byte, at)
				at += size
			else
				for sub in str.substrings do
					var flat = sub.as(FlatString)
					var size = flat._bytelen
					flat._items.copy_to(out, size, flat._first_byte, at)
					at += size
				end
			end
		end
		return new FlatString.with_infos(out, total, 0)
	end
end
1399
redef class NativeArray[E]
	# Concatenation of all the strings of `self`
	#
	# `self` must be a `NativeArray[String]`.
	redef fun native_to_s do
		assert self isa NativeArray[String]
		var strs = self
		var count = length

		# First pass: total number of bytes
		var total = 0
		for i in [0 .. count[ do total += strs[i].bytelen

		# Second pass: copy the bytes of each string one after the other
		var out = new NativeString(total + 1)
		out[total] = 0u8
		var at = 0
		for i in [0 .. count[ do
			var str = strs[i]
			if str isa FlatString then
				var size = str._bytelen
				str._items.copy_to(out, size, str._first_byte, at)
				at += size
			else
				for sub in str.substrings do
					var flat = sub.as(FlatString)
					var size = flat._bytelen
					flat._items.copy_to(out, size, flat._first_byte, at)
					at += size
				end
			end
		end
		return new FlatString.with_infos(out, total, 0)
	end
end
1436
redef class Map[K,V]
	# Concatenation of the couples of `self`, in iteration order
	#
	# `couple_sep` is written between a key and its value, `sep` between
	# two couples. A `null` key or value is written `<null>`.
	redef fun join(sep, couple_sep)
	do
		if is_empty then return ""

		var out = new Buffer
		var it = iterator
		var first = true
		while it.is_ok do
			# The separator goes before every couple but the first one
			if first then
				first = false
			else
				out.append(sep)
			end
			var k = it.key
			var e = it.item
			out.append("{k or else "<null>"}{couple_sep}{e or else "<null>"}")
			it.next
		end
		return out.to_s
	end
end