lib/core: Replaced hexadecimal values by chars for byte-oriented escapings
[nit.git] / lib / core / text / flat.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # All the array-based text representations
12 module flat
13
14 intrude import abstract_text
15 intrude import native
16
17 `{
18 #include <stdio.h>
19 #include <string.h>
20 `}
21
# Iterator over the substrings of a flat text
#
# Yields `tgt` once, then becomes exhausted.
private class FlatSubstringsIter
	super Iterator[FlatText]

	# Text still to be yielded, `null` once it has been consumed
	var tgt: nullable FlatText

	redef fun is_ok
	do
		return tgt != null
	end

	redef fun item
	do
		assert is_ok
		return tgt.as(not null)
	end

	redef fun next
	do
		tgt = null
	end
end
36
37 redef class FlatText
38
# Index in `_items` of the first byte of the text
protected fun first_byte: Int
do
	return 0
end

# Index in `_items` of the last byte of the text
protected fun last_byte: Int
do
	return _bytelen + first_byte - 1
end

# Character index of the latest position explored in the text (cache)
var position = 0

# Byte index in `_items` that goes with the cached `position`
var bytepos = 0
50
# Byte index in `_items` of the character at position `index`
#
# The answer is stored in `_position`/`_bytepos` before returning,
# so that the next lookup can start from it.
fun char_to_byte_index(index: Int): Int do
	var native = _items
	var cur = _bytepos
	var shift = index - _position

	# Same character as the cached one
	if shift == 0 then return cur

	if shift == 1 then
		# Step over the cached character
		if native[cur] & 0x80u8 == 0x00u8 then
			cur += 1
		else
			cur += native.length_of_char_at(cur)
		end
		_bytepos = cur
		_position = index
		return cur
	end

	if shift == -1 then
		# Step back to the beginning of the previous character
		cur = native.find_beginning_of_char_at(cur - 1)
		_bytepos = cur
		_position = index
		return cur
	end

	# General case: start from the closest known anchor among
	# the beginning, the end and the cached position
	var last_char = _length - 1
	var cached = _position
	var from_start = index
	var from_end = last_char - index
	var from_cache = (cached - index).abs

	var best = from_start
	if from_cache < best then best = from_cache
	if from_end < best then best = from_end

	var anchor_byte: Int
	var anchor_char: Int
	if best == from_cache then
		anchor_byte = _bytepos
		anchor_char = cached
	else if best == from_start then
		anchor_byte = first_byte
		anchor_char = 0
	else
		anchor_byte = native.find_beginning_of_char_at(last_byte)
		anchor_char = last_char
	end

	var res = native.char_to_byte_index_cached(index, anchor_char, anchor_byte)

	_position = index
	_bytepos = res

	return res
end
107
# Number of additional bytes needed to escape `self` to HTML
#
# The result is 0 when no byte of `self` has to be replaced.
fun chars_to_html_escape: Int do
	var src = _items
	var extra = 0
	var i = first_byte
	var last = last_byte
	while i <= last do
		var c = src[i]
		if c == b'<' or c == b'>' then
			# Replaced by a 4-byte entity
			extra += 3
		else if c == b'&' or c == b'"' or c == b'\'' or c == b'/' then
			# Replaced by a 5-byte entity
			extra += 4
		end
		i += 1
	end
	return extra
end
133
# Copy of `self` where `<`, `>`, `&`, `"`, `'` and `/` are replaced
# by `&lt;`, `&gt;`, `&amp;`, `&#34;`, `&#39;` and `&#47;`
#
# When nothing has to be replaced, `to_s` is returned directly.
redef fun html_escape
do
	var extra = chars_to_html_escape
	if extra == 0 then return to_s
	var src = _items
	var total = _bytelen + extra
	var dst = new NativeString(total)
	var o = 0
	var i = first_byte
	var last = last_byte
	while i <= last do
		var c = src[i]
		if c == b'<' or c == b'>' then
			# `&lt;` or `&gt;`
			dst[o] = b'&'
			if c == b'<' then dst[o + 1] = b'l' else dst[o + 1] = b'g'
			dst[o + 2] = b't'
			dst[o + 3] = b';'
			o += 4
		else if c == b'&' then
			dst[o] = b'&'
			dst[o + 1] = b'a'
			dst[o + 2] = b'm'
			dst[o + 3] = b'p'
			dst[o + 4] = b';'
			o += 5
		else if c == b'"' or c == b'\'' or c == b'/' then
			# Numeric reference: `&#34;`, `&#39;` or `&#47;`
			dst[o] = b'&'
			dst[o + 1] = b'#'
			if c == b'/' then
				dst[o + 2] = b'4'
				dst[o + 3] = b'7'
			else
				dst[o + 2] = b'3'
				if c == b'"' then dst[o + 3] = b'4' else dst[o + 3] = b'9'
			end
			dst[o + 4] = b';'
			o += 5
		else
			# Any other byte is copied untouched
			dst[o] = c
			o += 1
		end
		i += 1
	end
	return new FlatString.with_infos(dst, total, 0)
end
198
# Number of additional bytes needed to escape `self` to C
#
# `escape_to_c` uses a result of 0 as the signal that `self`
# can be returned without building a new string.
fun chars_to_escape_to_c: Int do
	var src = _items
	var extra = 0
	var i = first_byte
	var last = last_byte
	while i <= last do
		var c = src[i]
		if c == b'\n' or c == b'\t' or c == b'"' or c == b'\'' or c == b'\\' then
			# Written as a backslash followed by one byte
			extra += 1
		else if c < 32u8 then
			# Written as a backslash followed by three digits
			extra += 3
		end
		i += 1
	end
	return extra
end
228
# Copy of `self` escaped for use in a C string literal
#
# * tab and newline become `\t` and `\n`
# * `"`, `'` and `\` are preceded by a backslash
# * any other byte below 32 becomes `\0` followed by two octal digits
#
# When nothing has to be escaped, `self.to_s` is returned directly.
redef fun escape_to_c do
	var extra = chars_to_escape_to_c
	if extra == 0 then return self.to_s
	var src = _items
	var total = _bytelen + extra
	var dst = new NativeString(total)
	var o = 0
	var i = first_byte
	var last = last_byte
	while i <= last do
		var c = src[i]
		if c == b'\t' or c == b'\n' then
			dst[o] = b'\\'
			if c == b'\t' then dst[o + 1] = b't' else dst[o + 1] = b'n'
			o += 2
		else if c == b'"' or c == b'\'' or c == b'\\' then
			# The byte itself is kept, just behind a backslash
			dst[o] = b'\\'
			dst[o + 1] = c
			o += 2
		else if c < 32u8 then
			# Remaining control bytes: octal value on two digits after `\0`
			dst[o] = b'\\'
			dst[o + 1] = b'0'
			dst[o + 2] = ((c & 0x38u8) >> 3) + b'0'
			dst[o + 3] = (c & 0x07u8) + b'0'
			o += 4
		else
			dst[o] = c
			o += 1
		end
		i += 1
	end
	return dst.to_s_unsafe(total)
end
290
# Character at position `index`
#
# Accesses to the cached position, to the next character or to the
# previous one are served first, since they are the most frequent
# requests; anything else goes through `fetch_char_at` after a
# bound check.
redef fun [](index) do
	var len = _length
	var native = _items
	var bpos = _bytepos
	var delta = index - _position

	if delta == 1 and index < len - 1 then
		if native[bpos] & 0x80u8 == 0x00u8 then
			# The cached character is a single byte: the wanted one
			# starts right after it. It is read by the `delta == 0`
			# case below.
			bpos += 1
			_position = index
			_bytepos = bpos
			delta = 0
		end
	else if delta == -1 and index > 1 then
		var prev = native[bpos - 1]
		if prev & 0x80u8 == 0x00u8 then
			# The previous byte is a whole character by itself
			_position = index
			_bytepos = bpos - 1
			return prev.ascii
		end
	end

	if delta == 0 then
		# The byte position of the wanted character is known
		var cur = native[bpos]
		if cur & 0x80u8 == 0x00u8 then return cur.ascii
		return native.char_at(bpos)
	end

	assert index >= 0 and index < len
	return fetch_char_at(index)
end
337
# Character at position `index` of `self`
#
# WARNING: no bound-checking is done, use at your own risks
fun fetch_char_at(index: Int): Char do
	var pos = char_to_byte_index(index)
	var native = _items
	var first = native[pos]
	# High bit set: the character spans several bytes
	if first & 0x80u8 != 0x00u8 then return native.char_at(pos)
	return first.ascii
end
348
# Integer value of `self` read as hexadecimal digits
#
# Reading starts at character `pos` (0 when `null`) and covers `ln`
# bytes (up to the end of `self` when `null`).
#
#     assert "ff".to_hex == 255
redef fun to_hex(pos, ln) do
	if pos == null then pos = 0
	if ln == null then ln = length - pos
	var native = _items
	var i = char_to_byte_index(pos)
	var stop = i + ln
	var res = 0
	while i < stop do
		# One digit is 4 bits
		res = (res << 4) + native[i].ascii.from_hex
		i += 1
	end
	return res
end
365 end
366
367 # Immutable strings of characters.
368 abstract class FlatString
369 super FlatText
370 super String
371
# Index of the first byte of `self` in `_items` (inclusive)
redef var first_byte is noinit

# View of `self` as a sequence of characters, built on first use
redef var chars is lazy do
	return new FlatStringCharView(self)
end

# View of `self` as a sequence of bytes, built on first use
redef var bytes is lazy do
	return new FlatStringByteView(self)
end
378
# Copy of the bytes of `self` followed by a 0 byte, built on first use
redef var to_cstring is lazy do
	var size = _bytelen
	var copy = new NativeString(size + 1)
	copy[size] = 0u8
	_items.copy_to(copy, size, _first_byte, 0)
	return copy
end
386
# New string made of the characters of `self` in reverse order
redef fun reversed do
	var buf = new FlatBuffer.with_capacity(_bytelen + 1)
	var n = _length
	# Walk from the last character down to the first one
	for k in [1 .. n] do buf.add fetch_char_at(n - k)
	var res = buf.to_s.as(FlatString)
	res._length = n
	return res
end
398
# `fast_cstring` of `_items`, taken at the first byte of `self`
redef fun fast_cstring
do
	return _items.fast_cstring(_first_byte)
end
400
# Substring of `count` characters starting at character `from`
#
# A negative `from` shortens `count` accordingly and the range is
# cut at the end of `self`; an empty range gives `""`.
redef fun substring(from, count)
do
	if count <= 0 then return ""

	if from < 0 then
		# The part before the string is dropped from the request
		count += from
		if count < 0 then return ""
		from = 0
	end

	var total = _length
	if (count + from) > total then count = total - from
	if count <= 0 then return ""
	return substring_impl(from, count, from + count - 1)
end
417
# Builds the substring of `count` characters going from character
# `from` to character `end_index` (both inclusive)
#
# The bounds are expected to be valid, no check is done here.
private fun substring_impl(from, count, end_index: Int): String do
	var cached = _position
	var dist_from = (cached - from).abs
	var dist_end = (end_index - from).abs

	# Both bounds are always resolved, only the order changes
	var byte_start: Int
	var byte_end: Int
	if dist_from >= dist_end then
		byte_end = char_to_byte_index(end_index)
		byte_start = char_to_byte_index(from)
	else
		byte_start = char_to_byte_index(from)
		byte_end = char_to_byte_index(end_index)
	end

	var native = _items
	# Move from the first byte of the last character to its last byte
	byte_end += native.length_of_char_at(byte_end) - 1

	return new FlatString.full(native, byte_end - byte_start + 1, byte_start, count)
end
439
# The empty string, as a `FlatString`
redef fun empty
do
	return "".as(FlatString)
end
441
# New string where each character of `self` is replaced by its `to_upper`
redef fun to_upper
do
	var out = new FlatBuffer.with_capacity(self._bytelen + 1)
	var view = chars
	for i in [0 .. _length[ do out.add(view[i].to_upper)
	return out.to_s
end
456
# New string where each character of `self` is replaced by its `to_lower`
redef fun to_lower
do
	var out = new FlatBuffer.with_capacity(self._bytelen + 1)
	var view = chars
	for i in [0 .. _length[ do out.add(view[i].to_lower)
	return out.to_s
end
471
# Calls `output` on each character of `self`, in order
redef fun output
do
	for c in chars do
		c.output
	end
end
476
477 ##################################################
478 # String Specific Methods #
479 ##################################################
480
# Low-level creation of a new string from its bytes only.
#
# The number of characters is computed with `utf8_length`.
#
# `items` is used as is, without copy, to retrieve the characters of the string.
# Aliasing issues are the responsibility of the caller.
private new with_infos(items: NativeString, bytelen, from: Int)
do
	var len = items.utf8_length(from, bytelen)
	# As many characters as bytes: every character is a single byte
	if bytelen != len then return new UnicodeFlatString.full_data(items, bytelen, from, len)
	return new ASCIIFlatString.full_data(items, bytelen, from, len)
end
491
# Low-level creation of a new string when both lengths are already known.
#
# `items` is used as is, without copy, to retrieve the characters of the string.
# Aliasing issues are the responsibility of the caller.
private new full(items: NativeString, bytelen, from, length: Int)
do
	# As many characters as bytes: every character is a single byte
	if bytelen != length then return new UnicodeFlatString.full_data(items, bytelen, from, length)
	return new ASCIIFlatString.full_data(items, bytelen, from, length)
end
501
# Byte-wise equality with another flat text
#
# Anything that is not a `FlatText` is handled by the inherited `==`.
redef fun ==(other)
do
	if not other isa FlatText then return super
	if self.object_id == other.object_id then return true

	# Different byte lengths cannot hold the same content
	var nbytes = _bytelen
	if other._bytelen != nbytes then return false

	var mine = self._items
	var theirs = other._items
	var i = _first_byte
	var j = other.first_byte
	var stop = i + nbytes
	while i < stop do
		if mine[i] != theirs[j] then return false
		i += 1
		j += 1
	end
	return true
end
528
# Byte-wise ordering with another flat text
#
# The first differing byte decides; when one text is a prefix of
# the other, the shorter one (in bytes) comes first.
# Anything that is not a `FlatText` is handled by the inherited `<`.
redef fun <(other)
do
	if not other isa FlatText then return super
	if self.object_id == other.object_id then return false

	var mine = _items
	var theirs = other._items
	var my_len = _bytelen
	var its_len = other.bytelen

	# Only the common prefix can be compared byte by byte
	var common = its_len
	if my_len < its_len then common = my_len

	var i = _first_byte
	var j = other.first_byte
	var stop = i + common
	while i < stop do
		var a = mine[i]
		var b = theirs[j]
		if a != b then return a < b
		i += 1
		j += 1
	end

	return my_len < its_len
end
558
# New string holding the bytes of `self` followed by those of `o.to_s`
#
# Aborts if `o.to_s` is not a `FlatText`.
redef fun +(o) do
	var rhs = o.to_s
	var right_len = rhs.bytelen
	if not rhs isa FlatText then abort
	var left_len = _bytelen
	var total = left_len + right_len
	var res = new NativeString(total + 1)
	_items.copy_to(res, left_len, _first_byte, 0)
	rhs._items.copy_to(res, right_len, rhs.first_byte, left_len)
	return new FlatString.full(res, total, 0, _length + o.length)
end
577
# New string made of `i` copies of `self` put one after the other
redef fun *(i) do
	var unit_bytes = _bytelen
	var total_bytes = unit_bytes * i
	var total_chars = _length * i
	var src = _items
	var start = _first_byte
	var dst = new NativeString(total_bytes + 1)
	# Terminating 0 byte
	dst[total_bytes] = 0u8
	for k in [0 .. i[ do src.copy_to(dst, unit_bytes, start, k * unit_bytes)
	return new FlatString.full(dst, total_bytes, 0, total_chars)
end
595
# djb2 hash of the bytes of `self`
#
# The value is computed once and then kept in `hash_cache`.
redef fun hash
do
	var cached = hash_cache
	if cached != null then return cached

	var h = 5381
	var native = _items
	var last = last_byte
	var i = _first_byte
	while i <= last do
		# h * 33 + byte
		h = (h << 5) + h + native[i].to_i
		i += 1
	end

	hash_cache = h
	return h
end
616
# Iterator yielding `self` as its single substring
redef fun substrings
do
	return new FlatSubstringsIter(self)
end
618 end
619
# Regular Nit UTF-8 strings
private class UnicodeFlatString
	super FlatString

	# Low-level initialization from all the data, `items` is used without copy
	init full_data(items: NativeString, bytelen, from, length: Int) do
		_items = items
		_bytelen = bytelen
		_length = length
		# The byte cache starts on the first character
		_first_byte = from
		_bytepos = from
	end

	# Suffix of `self` starting at character `from`
	#
	# `self` is returned when `from <= 0` (unless the string is empty)
	# and `empty` when `from` is past the last character.
	redef fun substring_from(from) do
		if from >= self._length then return empty
		if from <= 0 then return self
		var start = char_to_byte_index(from)
		var skipped = start - _first_byte
		return new FlatString.full(items, bytelen - skipped, start, _length - from)
	end
end
641
# Special cases of String where all the characters are ASCII-based
#
# Since a character is always a single byte, a character index is
# also a byte offset from `_first_byte`:
# it optimizes access operations to O(1) complexity.
private class ASCIIFlatString
	super FlatString

	# Low-level initialization from all the data, `items` is used without copy
	init full_data(items: NativeString, bytelen, from, length: Int) do
		self._items = items
		self._length = length
		self._bytelen = bytelen
		_first_byte = from
		_bytepos = from
	end

	# Character at position `idx`, read directly at its byte offset
	redef fun [](idx) do
		assert idx < _bytelen and idx >= 0
		return _items[idx + _first_byte].ascii
	end

	# Substring of `count` characters starting at character `from`
	#
	# A negative `from` shortens `count` accordingly and the range is
	# cut at the end of `self`; an empty range gives `""`.
	# The result shares the bytes of `self`.
	redef fun substring(from, count) do
		if count <= 0 then return ""

		if from < 0 then
			count += from
			if count <= 0 then return ""
			from = 0
		end
		var ln = _length
		if (count + from) > ln then count = ln - from
		# `from` may lie at or after the end of the string: without this
		# check a string with a null or negative length would be built.
		if count <= 0 then return ""
		return new ASCIIFlatString.full_data(_items, count, from + _first_byte, count)
	end

	# New string made of the characters of `self` in reverse order
	redef fun reversed do
		var b = new FlatBuffer.with_capacity(_bytelen + 1)
		var i = _length - 1
		while i >= 0 do
			b.add self[i]
			i -= 1
		end
		return b.to_s.as(FlatString)
	end

	redef fun char_to_byte_index(index) do return index + _first_byte

	# No check is done on the bounds, see `substring` for the safe version
	redef fun substring_impl(from, count, end_index) do
		return new ASCIIFlatString.full_data(_items, count, from + _first_byte, count)
	end

	redef fun fetch_char_at(i) do return _items[i + _first_byte].ascii
end
693
# Iterator over the characters of a `FlatString`, from `curr_pos` down to 0
private class FlatStringCharReverseIterator
	super IndexedIterator[Char]

	# String whose characters are visited
	var target: FlatString

	# Index of the character currently pointed at
	var curr_pos: Int

	redef fun index do return curr_pos

	redef fun is_ok
	do
		return curr_pos >= 0
	end

	redef fun item
	do
		return target[curr_pos]
	end

	redef fun next
	do
		curr_pos -= 1
	end

end
710
# Iterator over the characters of a `FlatString`, from `curr_pos` up to the end
private class FlatStringCharIterator
	super IndexedIterator[Char]

	# String whose characters are visited
	var target: FlatString

	# Index of the last character of `target`, set at construction
	var max: Int is noautoinit

	# Index of the character currently pointed at
	var curr_pos: Int

	init
	do
		max = target._length - 1
	end

	redef fun index do return curr_pos

	redef fun is_ok
	do
		return curr_pos <= max
	end

	redef fun item
	do
		return target[curr_pos]
	end

	redef fun next
	do
		curr_pos += 1
	end

end
731
# View of a `FlatString` as a sequence of characters
private class FlatStringCharView
	super StringCharView

	redef type SELFTYPE: FlatString

	redef fun [](index)
	do
		return target[index]
	end

	redef fun iterator_from(start)
	do
		return new FlatStringCharIterator(target, start)
	end

	redef fun reverse_iterator_from(start)
	do
		return new FlatStringCharReverseIterator(target, start)
	end

end
744
# Iterator over the bytes of a `FlatString`, going backwards
#
# `curr_pos` is given relative to the string at construction and is
# then kept as an absolute position in `target_items`.
private class FlatStringByteReverseIterator
	super IndexedIterator[Byte]

	# String whose bytes are visited
	var target: FlatString

	# Bytes of `target`, fetched once at construction
	var target_items: NativeString is noautoinit

	# Position of the current byte in `target_items`
	var curr_pos: Int

	init
	do
		target_items = target._items
		# From an index in the string to an index in the native array
		curr_pos = curr_pos + target._first_byte
	end

	redef fun index do return curr_pos - target._first_byte

	redef fun is_ok do return curr_pos >= target._first_byte

	redef fun item
	do
		return target_items[curr_pos]
	end

	redef fun next
	do
		curr_pos -= 1
	end

end
770
# Iterator over the bytes of a `FlatString`, going forwards
#
# `curr_pos` is given relative to the string at construction and is
# then kept as an absolute position in `target_items`.
private class FlatStringByteIterator
	super IndexedIterator[Byte]

	# String whose bytes are visited
	var target: FlatString

	# Bytes of `target`, fetched once at construction
	var target_items: NativeString is noautoinit

	# Position of the current byte in `target_items`
	var curr_pos: Int

	init
	do
		target_items = target._items
		# From an index in the string to an index in the native array
		curr_pos = curr_pos + target._first_byte
	end

	redef fun index do return curr_pos - target._first_byte

	redef fun is_ok do return curr_pos <= target.last_byte

	redef fun item
	do
		return target_items[curr_pos]
	end

	redef fun next
	do
		curr_pos += 1
	end

end
796
# View of a `FlatString` as a sequence of bytes
private class FlatStringByteView
	super StringByteView

	redef type SELFTYPE: FlatString

	# Byte at position `index`, counted from the first byte of the string
	redef fun [](index)
	do
		var str = _target
		# A valid index cannot reach past the last byte of the string
		assert index >= 0 and index < str._bytelen
		return str._items[index + str._first_byte]
	end

	redef fun iterator_from(start)
	do
		return new FlatStringByteIterator(target, start)
	end

	redef fun reverse_iterator_from(start)
	do
		return new FlatStringByteReverseIterator(target, start)
	end

end
817
redef class Buffer
	# A new `Buffer` is a `FlatBuffer`
	redef new
	do
		return new FlatBuffer
	end

	# A new `Buffer` is a `FlatBuffer` created with a capacity of `i`
	redef new with_cap(i)
	do
		return new FlatBuffer.with_capacity(i)
	end
end
823
824 # Mutable strings of characters.
825 class FlatBuffer
826 super FlatText
827 super Buffer
828
# View of `self` as a sequence of characters, built on first use
redef var chars: Sequence[Char] is lazy do
	return new FlatBufferCharView(self)
end

# View of `self` as a sequence of bytes, built on first use
redef var bytes is lazy do
	return new FlatBufferByteView(self)
end

private var char_cache: Int = -1

private var byte_cache: Int = -1

# Size in bytes of the storage behind `_items`
private var capacity = 0

# Real items, used as cache for when to_cstring is called
private var real_items: NativeString is noinit
841
# `fast_cstring` of `_items`, taken at byte 0
redef fun fast_cstring
do
	return _items.fast_cstring(0)
end

# Iterator yielding `self` as its single substring
redef fun substrings
do
	return new FlatSubstringsIter(self)
end
845
# Replaces `_items` by a fresh copy of `capacity` bytes
#
# Called before a modification when the flag `written` is set,
# the flag is cleared afterwards.
private fun reset do
	var fresh = new NativeString(capacity)
	var used = _bytelen
	if used != 0 then _items.copy_to(fresh, used, 0, 0)
	_items = fresh
	written = false
end
856
# Shifts the content of the buffer by `len` bytes to the right, starting at byte `from`
#
# When the shifted content does not fit in `capacity`, a larger
# storage is allocated, the bytes before `from` are copied into it
# and it becomes the new `_items`.
#
# Internal only, does not modify _bytelen or length, this is the caller's responsibility
private fun rshift_bytes(from: Int, len: Int) do
	var oit = _items
	var nit = oit
	var bt = _bytelen
	var needed = bt + len
	if needed > capacity then
		var ncap = capacity * 2 + 2
		if ncap < needed then ncap = needed
		nit = new NativeString(ncap)
		# Keep the bytes located before the shifted area
		oit.copy_to(nit, from, 0, 0)
		capacity = ncap
		_items = nit
	end
	oit.copy_to(nit, bt - from, from, from + len)
end

# Shifts the content of the buffer by `len` bytes to the left, starting at `from`
#
# Internal only, does not modify _bytelen or length, this is the caller's responsibility
private fun lshift_bytes(from: Int, len: Int) do
	var it = _items
	it.copy_to(it, _bytelen - from, from, from - len)
end

# Replaces the character at `index` by `item`
#
# `index == length` is accepted and behaves like `add`.
# When `item` does not take the same number of bytes as the replaced
# character, the following bytes are shifted accordingly.
redef fun []=(index, item)
do
	assert index >= 0 and index <= _length
	if written then reset
	is_dirty = true
	if index == _length then
		add item
		return
	end
	var it = _items
	var ip = it.char_to_byte_index(index)
	var c = it.char_at(ip)
	var clen = c.u8char_len
	var itemlen = item.u8char_len
	var size_diff = itemlen - clen
	if size_diff > 0 then
		rshift_bytes(ip + clen, size_diff)
		# The shift may have moved the content to a new storage
		it = _items
	else if size_diff < 0 then
		lshift_bytes(ip + clen, -size_diff)
	end
	_bytelen += size_diff
	# Characters after `index` moved by `size_diff` bytes:
	# keep the cached byte position consistent with the cached position
	if _position > index then _bytepos += size_diff
	it.set_char_at(ip, item)
end
903
# Appends the character `c` at the end of the buffer
redef fun add(c)
do
	if written then reset
	is_dirty = true
	var width = c.u8char_len
	var at = _bytelen
	# Make room for the bytes of `c` before writing them
	enlarge(at + width)
	_items.set_char_at(at, c)
	_bytelen = _bytelen + width
	_length = _length + 1
end
915
# Removes the whole content of the buffer
redef fun clear do
	is_dirty = true
	_length = 0
	_bytelen = 0
	if written then reset
end
922
# A new `Buffer`
redef fun empty
do
	return new Buffer
end
924
# Makes sure that the storage can hold at least `cap` bytes
#
# Nothing is done when `capacity` is already large enough. Otherwise
# the capacity is doubled (from a minimum of 16) until it exceeds
# `cap` and the content is copied to a new storage.
redef fun enlarge(cap)
do
	var ncap = capacity
	if cap <= ncap then return
	if ncap <= 16 then ncap = 16
	while ncap <= cap do ncap = ncap * 2
	# The content is copied below, so the copy-on-write flag
	# does not need to stay set
	written = false
	var fresh = new NativeString(ncap)
	var used = _bytelen
	if used > 0 then _items.copy_to(fresh, used, 0, 0)
	_items = fresh
	capacity = ncap
end
943
# String with the current content of the buffer
#
# The string uses `_items` directly; `written` is set so that
# a later modification of the buffer works on a copy.
redef fun to_s
do
	written = true
	var used = _bytelen
	if used == 0 then _items = new NativeString(1)
	return new FlatString.full(_items, used, 0, _length)
end
951
# Copy of the content of the buffer followed by a 0 byte
#
# The copy is kept in `real_items` and only rebuilt when `is_dirty` is set.
redef fun to_cstring
do
	if not is_dirty then return real_items
	var used = _bytelen
	var fresh = new NativeString(used + 1)
	fresh[used] = 0u8
	if _length > 0 then _items.copy_to(fresh, used, 0, 0)
	real_items = fresh
	is_dirty = false
	return fresh
end
964
# Create a new empty string.
init
do
end

# Low-level creation of a new buffer from all its data.
#
# `items` is used as is, without copy, to store the characters of the buffer.
# Aliasing issues are the responsibility of the caller.
#
# If `items` is shared, `written` should be set to true after the creation
# so that a modification will do a copy-on-write.
private init with_infos(items: NativeString, capacity, bytelen, length: Int)
do
	_items = items
	_bytelen = bytelen
	_length = length
	self.capacity = capacity
end
982
# Create a new string copied from `s`.
#
# The bytes of each substring of `s` are copied one after the other
# in a storage owned by the new buffer, starting from the first byte
# of the substring in its own storage.
init from(s: Text)
do
	var blen = s.bytelen
	var its = new NativeString(blen)
	var off = 0
	for sub in s.substrings do
		var sublen = sub._bytelen
		sub._items.copy_to(its, sublen, sub.first_byte, off)
		off += sublen
	end
	_items = its
	_bytelen = blen
	_length = s.length
	_capacity = blen
	# The storage is private to this buffer: no copy-on-write needed
	written = false
end
997
# Create a new empty string with a storage of `cap` bytes.
#
# `cap` must not be negative.
init with_capacity(cap: Int)
do
	assert cap >= 0
	capacity = cap
	_bytelen = 0
	_items = new NativeString(cap)
end
1006
# Appends the content of `s` at the end of the buffer
#
# A text that is not a `FlatText` is appended substring by substring.
redef fun append(s)
do
	if s.is_empty then return
	is_dirty = true
	var added = s.bytelen
	var total = _bytelen + added
	enlarge(total)
	if not s isa FlatText then
		for sub in s.substrings do append sub
		return
	end
	s._items.copy_to(_items, added, s.first_byte, _bytelen)
	_bytelen = total
	_length += s.length
end
1023
# Copies `len` characters of `self`, from position `start`,
# into `dest`, from position `new_start`
fun copy(start: Int, len: Int, dest: Buffer, new_start: Int)
do
	var src = self.chars
	var dst = dest.chars
	var i = 0
	while i < len do
		dst[new_start + i] = src[start + i]
		i += 1
	end
end
1033
# New buffer holding a copy of `count` characters of `self`, from character `from`
#
# `count` must not be negative. A negative `from` is treated as 0
# and the range is cut at the end of `self`; an empty range gives
# a new empty `Buffer`.
redef fun substring(from, count)
do
	assert count >= 0
	if from < 0 then from = 0
	if (from + count) > _length then count = _length - from
	if count <= 0 then return new Buffer
	var native = _items
	var byte_start = native.char_to_byte_index(from)
	var last_start = native.char_to_byte_index(count + from - 1)
	# First byte after the last character of the range
	var byte_stop = last_start + native.char_at(last_start).u8char_len
	var nbytes = byte_stop - byte_start
	var copy = new NativeString(nbytes)
	native.copy_to(copy, nbytes, byte_start, 0)
	return new FlatBuffer.with_infos(copy, nbytes, nbytes, count)
end
1049
# Reverses the order of the characters of the buffer, in place
#
# The characters are written backwards in a new storage that
# replaces `_items`.
redef fun reverse
do
	written = false
	# The content changes: a cached `to_cstring` is no longer valid
	is_dirty = true
	var ns = new FlatBuffer.with_capacity(capacity)
	for i in chars.reverse_iterator do ns.add i
	_items = ns._items
end
1057
# Appends the current content to the buffer `repeats - 1` times
redef fun times(repeats)
do
	# Snapshot of the content before anything is appended
	var unit = new FlatString.full(_items, _bytelen, 0, _length)
	var k = 1
	while k < repeats do
		append(unit)
		k += 1
	end
end
1066
# Replaces each character of the buffer by its `to_upper`
redef fun upper
do
	if written then reset
	var n = _length
	var i = 0
	while i < n do
		self[i] = self[i].to_upper
		i += 1
	end
end
1072
# Replaces each character of the buffer by its `to_lower`
redef fun lower
do
	if written then reset
	var n = _length
	var i = 0
	while i < n do
		self[i] = self[i].to_lower
		i += 1
	end
end
1078 end
1079
# Iterator over the bytes of a `FlatBuffer`, from `curr_pos` down to 0
private class FlatBufferByteReverseIterator
	super IndexedIterator[Byte]

	# Buffer whose bytes are visited
	var target: FlatBuffer

	# Bytes of `target`, fetched once at construction
	var target_items: NativeString is noautoinit

	# Position of the current byte
	var curr_pos: Int

	init
	do
		target_items = target._items
	end

	redef fun is_ok do return curr_pos >= 0

	redef fun index do return curr_pos

	redef fun item
	do
		return target_items[curr_pos]
	end

	redef fun next
	do
		curr_pos -= 1
	end

end
1100
# View of a `FlatBuffer` as a sequence of bytes
private class FlatBufferByteView
	super BufferByteView

	redef type SELFTYPE: FlatBuffer

	# Byte at position `index`, no bound check is done here
	redef fun [](index)
	do
		return target._items[index]
	end

	redef fun iterator_from(pos)
	do
		return new FlatBufferByteIterator(target, pos)
	end

	redef fun reverse_iterator_from(pos)
	do
		return new FlatBufferByteReverseIterator(target, pos)
	end

end
1113
# Iterator over the bytes of a `FlatBuffer`, from `curr_pos` up to the last byte
private class FlatBufferByteIterator
	super IndexedIterator[Byte]

	# Buffer whose bytes are visited
	var target: FlatBuffer

	# Bytes of `target`, fetched once at construction
	var target_items: NativeString is noautoinit

	# Position of the current byte
	var curr_pos: Int

	init
	do
		target_items = target._items
	end

	redef fun is_ok do return curr_pos < target._bytelen

	redef fun index do return curr_pos

	redef fun item
	do
		return target_items[curr_pos]
	end

	redef fun next
	do
		curr_pos += 1
	end

end
1134
# Iterator over the characters of a `FlatBuffer`, from `curr_pos` down to 0
private class FlatBufferCharReverseIterator
	super IndexedIterator[Char]

	# Buffer whose characters are visited
	var target: FlatBuffer

	# Index of the character currently pointed at
	var curr_pos: Int

	redef fun is_ok do return curr_pos >= 0

	redef fun index do return curr_pos

	redef fun item
	do
		return target[curr_pos]
	end

	redef fun next
	do
		curr_pos -= 1
	end

end
1151
# View of a `FlatBuffer` as a sequence of characters
#
# Modifications are forwarded to the buffer.
private class FlatBufferCharView
	super BufferCharView

	redef type SELFTYPE: FlatBuffer

	redef fun [](index)
	do
		return target[index]
	end

	# Replaces the character at `index`, or adds `item` when `index == length`
	redef fun []=(index, item)
	do
		assert index >= 0 and index <= length
		if index != length then
			target[index] = item
			return
		end
		add(item)
	end

	redef fun push(c) do target.add(c)

	redef fun add(c) do target.add(c)

	# Asks the buffer for a storage of at least `cap` bytes
	fun enlarge(cap: Int) do target.enlarge(cap)

	# Adds each character of `s` at the end of the buffer
	redef fun append(s)
	do
		var incoming = s.length
		if target.capacity < incoming then enlarge(incoming + target._length)
		for c in s do target.add c
	end

	redef fun iterator_from(pos)
	do
		return new FlatBufferCharIterator(target, pos)
	end

	redef fun reverse_iterator_from(pos)
	do
		return new FlatBufferCharReverseIterator(target, pos)
	end

end
1196
# Iterator over the characters of a `FlatBuffer`, from `curr_pos` up to the end
private class FlatBufferCharIterator
	super IndexedIterator[Char]

	# Buffer whose characters are visited
	var target: FlatBuffer

	# Index of the last character of `target`, set at construction
	var max: Int is noautoinit

	# Index of the character currently pointed at
	var curr_pos: Int

	init
	do
		max = target._length - 1
	end

	redef fun is_ok do return curr_pos <= max

	redef fun index do return curr_pos

	redef fun item
	do
		return target[curr_pos]
	end

	redef fun next
	do
		curr_pos += 1
	end

end
1217
1218 redef class NativeString
# String made from the `cstring_length` first bytes of `self`
redef fun to_s
do
	var len = cstring_length
	return to_s_with_length(len)
end

# String made from the `length` first bytes of `self`, passed through `clean_utf8`
#
# `length` must not be negative.
redef fun to_s_with_length(length)
do
	assert length >= 0
	var res = clean_utf8(length)
	return res
end
1229
# String using `self` without copy, both lengths being given by the caller
redef fun to_s_full(bytelen, unilen)
do
	return new FlatString.full(self, bytelen, 0, unilen)
end

# String using `self` without copy and without going through `clean_utf8`
#
# `cstring_length` is used when `len` is `null`.
redef fun to_s_unsafe(len)
do
	if len == null then len = cstring_length
	return new FlatString.with_infos(self, len, 0)
end

# String made from a copy of the `cstring_length` first bytes of `self`
redef fun to_s_with_copy
do
	return to_s_with_copy_and_length(cstring_length)
end
1240
# Get a `String` from `length` bytes at `self` copied into Nit memory
#
# When `clean_utf8` already produced a new storage, its result is
# returned directly. Otherwise the bytes are copied, followed by a
# 0 byte, and the copy also serves as the `to_cstring` of the result.
fun to_s_with_copy_and_length(length: Int): String
do
	var cleaned = clean_utf8(length)
	if cleaned.items != self then return cleaned
	var copy = new NativeString(length + 1)
	copy_to(copy, length, 0, 0)
	copy[length] = 0u8
	var res = new FlatString.with_infos(copy, length, 0)
	res.to_cstring = copy
	return res
end
1253
# String made from the `len` first bytes of `self`, where each byte
# that does not start a valid UTF-8 character is replaced by the
# three bytes `EF BF BD`
#
# When nothing has to be replaced, the string uses `self` without copy.
fun clean_utf8(len: Int): FlatString do
	var bad_positions: nullable Array[Int] = null
	var out_len = len
	var pos = 0
	var nchars = 0
	var left = len
	while left > 0 do
		# Skip 4 bytes at once as long as none of them has its high bit set
		while left >= 4 do
			var word = fetch_4_chars(pos)
			if word & 0x80808080 != 0 then break
			pos += 4
			nchars += 4
			left -= 4
		end
		if left == 0 then break
		var b = self[pos]
		if b & 0x80u8 == 0x00u8 then
			# Single-byte character
			pos += 1
			nchars += 1
			left -= 1
			continue
		end
		# The leading byte must match the announced length
		var seq_len = length_of_char_at(pos)
		var valid: Bool
		if seq_len == 1 then
			valid = b & 0x80u8 == 0u8
		else if seq_len == 2 then
			valid = b & 0xE0u8 == 0xC0u8
		else if seq_len == 3 then
			valid = b & 0xF0u8 == 0xE0u8
		else
			valid = b & 0xF8u8 == 0xF0u8
		end
		if valid then
			# The code point must be in the range allowed for that length
			var c = char_at(pos)
			var cp = c.code_point
			if seq_len == 1 then
				valid = cp >= 0 and cp <= 0x7F
			else if seq_len == 2 then
				valid = cp >= 0x80 and cp <= 0x7FF
			else if seq_len == 3 then
				valid = cp >= 0x800 and cp <= 0xFFFF
				valid = valid and not (cp >= 0xD800 and cp <= 0xDFFF) and cp != 0xFFFE and cp != 0xFFFF
			else
				valid = cp >= 0x10000 and cp <= 0x10FFFF
			end
			if valid then
				var clen = c.u8char_len
				pos += clen
				left -= clen
				nchars += 1
				continue
			end
		end
		# Invalid byte: it counts as one character and will be
		# replaced by 3 bytes, hence 2 more bytes in the result
		if bad_positions == null then bad_positions = new Array[Int]
		bad_positions.add pos
		out_len += 2
		pos += 1
		left -= 1
		nchars += 1
	end
	var ret = self
	if out_len != len then
		ret = new NativeString(out_len)
		var src_from = 0
		var dst_off = 0
		for bad in bad_positions.as(not null) do
			# Valid bytes located before the invalid one
			var chunk = bad - src_from
			copy_to(ret, chunk, src_from, dst_off)
			dst_off += chunk
			ret[dst_off] = 0xEFu8
			ret[dst_off + 1] = 0xBFu8
			ret[dst_off + 2] = 0xBDu8
			src_from = bad + 1
			dst_off += 3
		end
		# Valid bytes located after the last invalid one
		copy_to(ret, len - src_from, src_from, dst_off)
	end
	return new FlatString.full(ret, out_len, 0, nchars)
end
1347
# Sets the next bytes at position `pos` to the value of `c`, encoded in UTF-8
#
# Code points below 128 are stored directly as a single byte, the others
# are delegated to `native_set_char` with their `u8char_len`.
#
# Very unsafe, make sure to have room for this char prior to calling this function.
private fun set_char_at(pos: Int, c: Char) do
	var cp = c.code_point
	if cp < 128 then
		self[pos] = cp.to_b
	else
		native_set_char(pos, c, c.u8char_len)
	end
end
1359
# Writes `c` as a UTF-8 sequence of `ln` bytes starting at byte `pos`
#
# Nothing is written when `ln` is not in `[1 .. 4]`.
# No bound check is done on the destination.
private fun native_set_char(pos: Int, c: Char, ln: Int) `{
	char* dst = self + pos;
	switch(ln){
		case 1:
			dst[0] = c;
			break;
		case 2:
			/* 110xxxxx 10xxxxxx */
			dst[0] = 0xC0 | ((c >> 6) & 0x1F);
			dst[1] = 0x80 | (c & 0x3F);
			break;
		case 3:
			/* 1110xxxx 10xxxxxx 10xxxxxx */
			dst[0] = 0xE0 | ((c >> 12) & 0x0F);
			dst[1] = 0x80 | ((c >> 6) & 0x3F);
			dst[2] = 0x80 | (c & 0x3F);
			break;
		case 4:
			/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
			dst[0] = 0xF0 | ((c >> 18) & 0x07);
			dst[1] = 0x80 | ((c >> 12) & 0x3F);
			dst[2] = 0x80 | ((c >> 6) & 0x3F);
			dst[3] = 0x80 | (c & 0x3F);
			break;
	}
`}
1383 end
1384
redef class Int
	# Signed base-10 representation of `self`
	#
	#     assert 1.to_s    == "1"
	#     assert (-123).to_s == "-123"
	redef fun to_s do
		# Shortcuts for two very frequent values
		if self == 0 then
			return "0"
		end
		if self == 1 then
			return "1"
		end

		# `int_to_s_len` bytes of text, plus one trailing zero byte
		var digits = int_to_s_len
		var buf = new NativeString(digits + 1)
		buf[digits] = 0u8
		native_int_to_s(buf, digits + 1)
		# One byte per character, hence the same byte and char lengths
		return new FlatString.full(buf, digits, 0, digits)
	end
end
1402
redef class Array[E]

	# Fast implementation
	#
	# Concatenates the `to_s` of every non-null element into a single buffer.
	# Null elements are skipped.
	redef fun plain_to_s
	do
		var count = _length
		if count == 0 then return ""
		var elems = _items.as(not null)
		if count == 1 then
			var only = elems[0]
			if only == null then return ""
			return only.to_s
		end

		# First pass: stringify the elements and sum their byte lengths
		var parts = new NativeArray[String](count)
		var nparts = 0
		var total = 0
		var i = 0
		while i < count do
			var elem = elems[i]
			if elem != null then
				var str = elem.to_s
				total += str.bytelen
				parts[nparts] = str
				nparts += 1
			end
			i += 1
		end

		# Second pass: copy the bytes of each part into the destination,
		# which has one extra, zeroed, trailing byte
		var dest = new NativeString(total + 1)
		dest[total] = 0u8
		var woff = 0
		i = 0
		while i < nparts do
			var part = parts[i]
			if part isa FlatString then
				var plen = part._bytelen
				part._items.copy_to(dest, plen, part._first_byte, woff)
				woff += plen
			else
				# Non-flat text: copy each of its flat substrings
				for sub in part.substrings do
					var flat = sub.as(FlatString)
					var flen = flat._bytelen
					flat._items.copy_to(dest, flen, flat._first_byte, woff)
					woff += flen
				end
			end
			i += 1
		end
		return new FlatString.with_infos(dest, total, 0)
	end
end
1452
redef class NativeArray[E]
	# Concatenates all the strings of `self` into a single `FlatString`
	#
	# `self` must be a `NativeArray[String]`.
	redef fun native_to_s do
		assert self isa NativeArray[String]
		var strs = self
		var count = length

		# First pass: total byte length of the result
		var total = 0
		var i = 0
		while i < count do
			total += strs[i].bytelen
			i += 1
		end

		# Second pass: copy the bytes of each string into the destination,
		# which has one extra, zeroed, trailing byte
		var dest = new NativeString(total + 1)
		dest[total] = 0u8
		var woff = 0
		i = 0
		while i < count do
			var part = strs[i]
			if part isa FlatString then
				var plen = part._bytelen
				part._items.copy_to(dest, plen, part._first_byte, woff)
				woff += plen
			else
				# Non-flat text: copy each of its flat substrings
				for sub in part.substrings do
					var flat = sub.as(FlatString)
					var flen = flat._bytelen
					flat._items.copy_to(dest, flen, flat._first_byte, woff)
					woff += flen
				end
			end
			i += 1
		end
		return new FlatString.with_infos(dest, total, 0)
	end
end
1489
redef class Map[K,V]
	# Concatenates the couples of `self` as `key couple_sep value`,
	# with `sep` between two consecutive couples
	#
	# A null key or value is rendered as `<null>`.
	# An empty map yields an empty string.
	redef fun join(sep, couple_sep)
	do
		if is_empty then return ""

		var out = new Buffer
		var it = iterator
		loop
			# Append the current couple
			var key = it.key
			var val = it.item
			out.append("{key or else "<null>"}{couple_sep}{val or else "<null>"}")

			# The separator is only written when another couple follows
			it.next
			if not it.is_ok then break
			out.append(sep)
		end
		return out.to_s
	end
end