Merge: Use prefixed chars instead of raw values
[nit.git] / lib / core / text / flat.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # All the array-based text representations
12 module flat
13
14 intrude import abstract_text
15 intrude import native
16
17 `{
18 #include <stdio.h>
19 #include <string.h>
20 `}
21
# Iterator over the substrings of a flat text: the text itself, exactly once
private class FlatSubstringsIter
	super Iterator[FlatText]

	# Text still to be yielded, `null` once it has been consumed
	var tgt: nullable FlatText

	redef fun is_ok do return tgt != null

	redef fun item
	do
		var current = tgt
		assert current != null
		return current
	end

	redef fun next
	do
		tgt = null
	end
end
36
37 redef class FlatText
38
# First byte of the CString
protected fun first_byte: Int
do
	return 0
end

# Last byte of the CString
protected fun last_byte: Int
do
	var after_last = first_byte + _byte_length
	return after_last - 1
end

# Cache of the latest position (char) explored in the string
var position: Int = 0

# Cached position (bytes) in the CString underlying the String
var bytepos: Int = 0
50
# Index of the character `index` in `_items`
#
# The search starts from the closest known anchor (the cached position,
# the beginning or the end of the text), then the cache is moved to `index`.
fun char_to_byte_index(index: Int): Int
do
	var cached_char = _position
	var cached_byte = _bytepos
	var step = index - cached_char
	var bytes = _items

	# Same character as the cached one: nothing to compute
	if step == 0 then return cached_byte

	if step == 1 or step == -1 then
		var found: Int
		if step == 1 then
			# Next character: skip over the cached one
			if bytes[cached_byte] & 0x80u8 == 0x00u8 then
				found = cached_byte + 1
			else
				found = cached_byte + bytes.length_of_char_at(cached_byte)
			end
		else
			# Previous character: back up to its first byte
			found = bytes.find_beginning_of_char_at(cached_byte - 1)
		end
		_bytepos = found
		_position = index
		return found
	end

	# General case: find the best starting point
	var last_char = _length - 1
	var from_begin = index
	var from_end = last_char - index
	var from_cache = (cached_char - index).abs

	var best = from_begin
	if from_cache < best then best = from_cache
	if from_end < best then best = from_end

	var anchor_char: Int
	var anchor_byte: Int
	if best == from_cache then
		anchor_char = cached_char
		anchor_byte = cached_byte
	else if best == from_begin then
		anchor_char = 0
		anchor_byte = first_byte
	else
		anchor_char = last_char
		anchor_byte = bytes.find_beginning_of_char_at(last_byte)
	end

	var res = bytes.char_to_byte_index_cached(index, anchor_char, anchor_byte)

	_position = index
	_bytepos = res

	return res
end
107
# By escaping `self` to HTML, how many more bytes will be needed ?
fun chars_to_html_escape: Int
do
	var bytes = _items
	var stop = last_byte
	var extra = 0
	var i = first_byte
	while i <= stop do
		var byte = bytes[i]
		if byte == b'<' or byte == b'>' then
			# `&lt;` and `&gt;` take 4 bytes instead of 1
			extra += 3
		else if byte == b'&' or byte == b'"' or byte == b'\'' or byte == b'/' then
			# `&amp;`, `&#34;`, `&#39;` and `&#47;` take 5 bytes instead of 1
			extra += 4
		end
		i += 1
	end
	return extra
end
133
redef fun html_escape
do
	var growth = chars_to_html_escape
	# Nothing to escape: no new storage is needed
	if growth == 0 then return to_s

	var src = _items
	var stop = last_byte
	var out_len = growth + _byte_length
	var dst = new CString(out_len)
	var w = 0
	var r = first_byte
	while r <= stop do
		var byte = src[r]
		r += 1
		# Some HTML characters are used as meta-data, they need
		# to be replaced by an HTML-escaped equivalent
		if byte == b'<' or byte == b'>' then
			# `&lt;` or `&gt;`
			var letter = b'g'
			if byte == b'<' then letter = b'l'
			dst[w] = b'&'
			dst[w + 1] = letter
			dst[w + 2] = b't'
			dst[w + 3] = b';'
			w += 4
		else if byte == b'&' then
			# `&amp;`
			dst[w] = b'&'
			dst[w + 1] = b'a'
			dst[w + 2] = b'm'
			dst[w + 3] = b'p'
			dst[w + 4] = b';'
			w += 5
		else if byte == b'"' or byte == b'\'' or byte == b'/' then
			# Numeric references: `&#34;`, `&#39;` and `&#47;`
			var tens = b'3'
			var units = b'4'
			if byte == b'\'' then
				units = b'9'
			else if byte == b'/' then
				tens = b'4'
				units = b'7'
			end
			dst[w] = b'&'
			dst[w + 1] = b'#'
			dst[w + 2] = tens
			dst[w + 3] = units
			dst[w + 4] = b';'
			w += 5
		else
			dst[w] = byte
			w += 1
		end
	end
	return new FlatString.with_infos(dst, out_len, 0)
end
198
# By escaping `self` to C, how many more bytes will be needed ?
#
# This enables a double-optimization in `escape_to_c` since if this
# method returns 0, then `self` does not need escaping and can be
# returned as-is
fun chars_to_escape_to_c: Int do
	var its = _items
	var max = last_byte
	var pos = first_byte
	var req_esc = 0
	while pos <= max do
		var c = its[pos]
		if c == b'\n' then
			req_esc += 1
		else if c == b'\t' then
			req_esc += 1
		else if c == b'"' then
			req_esc += 1
		else if c == b'\'' then
			req_esc += 1
		else if c == b'\\' then
			req_esc += 1
		else if c == b'?' then
			# `\?`, but only where a trigraph would otherwise appear
			if trigraph_end_follows(pos, max) then req_esc += 1
		else if c < 32u8 then
			# `\0` followed by two octal digits instead of one byte
			req_esc += 3
		end
		pos += 1
	end
	return req_esc
end

# Is the byte right after `pos` one of those that can end a C trigraph?
#
# `pos` and `max` are byte indexes in `_items`, `max` being the last
# byte of `self`: nothing past it is read.
# Both `chars_to_escape_to_c` and `escape_to_c` rely on this single test,
# so the size computed by the former always matches what the latter writes.
#
# `'` is not considered because `??'` is already escaped as `??\'`.
private fun trigraph_end_follows(pos, max: Int): Bool do
	var j = pos + 1
	if j > max then return false
	var next = _items[j]
	return next == b'!' or
		next == b'(' or
		next == b')' or
		next == b'-' or
		next == b'/' or
		next == b'<' or
		next == b'=' or
		next == b'>'
end

redef fun escape_to_c do
	var ln_extra = chars_to_escape_to_c
	if ln_extra == 0 then return self.to_s
	var its = _items
	var max = last_byte
	var nlen = _byte_length + ln_extra
	var nns = new CString(nlen)
	var pos = first_byte
	var opos = 0
	while pos <= max do
		var c = its[pos]
		# Special codes:
		#
		# Any byte with value < 32 is a control character
		# All their uses will be replaced by their octal
		# value in C.
		#
		# There are two exceptions however:
		#
		# * 0x09 => \t
		# * 0x0A => \n
		#
		# Aside from the code points above, the following are:
		#
		# * 0x22 => \"
		# * 0x27 => \'
		# * 0x5C => \\
		#
		# Finally, a `?` gets a backslash when the next byte
		# could complete a trigraph.
		if c == b'\t' then
			nns[opos] = b'\\'
			nns[opos + 1] = b't'
			opos += 2
		else if c == b'\n' then
			nns[opos] = b'\\'
			nns[opos + 1] = b'n'
			opos += 2
		else if c == b'"' then
			nns[opos] = b'\\'
			nns[opos + 1] = b'"'
			opos += 2
		else if c == b'\'' then
			nns[opos] = b'\\'
			nns[opos + 1] = b'\''
			opos += 2
		else if c == b'\\' then
			nns[opos] = b'\\'
			nns[opos + 1] = b'\\'
			opos += 2
		else if c == b'?' then
			if trigraph_end_follows(pos, max) then
				nns[opos] = b'\\'
				opos += 1
			end
			nns[opos] = b'?'
			opos += 1
		else if c < 32u8 then
			nns[opos] = b'\\'
			nns[opos + 1] = b'0'
			nns[opos + 2] = ((c & 0x38u8) >> 3) + b'0'
			nns[opos + 3] = (c & 0x07u8) + b'0'
			opos += 4
		else
			nns[opos] = c
			opos += 1
		end
		pos += 1
	end
	return nns.to_s_unsafe(nlen, copy=false, clean=false)
end
327
redef fun [](index) do
	var len = _length

	# Statistically:
	# * ~70% want the next char
	# * ~23% want the previous
	# * ~7% want the same char
	#
	# So it makes sense to shortcut early. And early is here.
	var bytes = _items
	var at = _bytepos
	var step = index - _position
	var located = step == 0
	if step == 1 then
		if index < len - 1 and bytes[at] & 0x80u8 == 0x00u8 then
			# We want the next, and current is easy.
			# So next is easy to find!
			at += 1
			_position = index
			_bytepos = at
			# The rest is done by the `located` case below.
			located = true
		end
	else if step == -1 then
		if index > 1 then
			var prev = bytes[at - 1]
			if prev & 0x80u8 == 0x00u8 then
				# We want the previous, and it is easy.
				_position = index
				_bytepos = at - 1
				return prev.ascii
			end
		end
	end
	if located then
		# We know what we want (+0 or +1) just get it now!
		var lead = bytes[at]
		if lead & 0x80u8 == 0x00u8 then return lead.ascii
		return bytes.char_at(at)
	end

	assert index >= 0 and index < len
	return fetch_char_at(index)
end
374
# Gets a `Char` at `index` in `self`
#
# WARNING: Use at your own risks as no bound-checking is done
fun fetch_char_at(index: Int): Char
do
	var bytes = _items
	var at = char_to_byte_index(index)
	var lead = bytes[at]
	# High bit set: the character is decoded by the native string
	if lead & 0x80u8 != 0x00u8 then return bytes.char_at(at)
	return lead.ascii
end
385
# If `self` contains only digits and alpha <= 'f', return the corresponding integer.
#
#     assert "ff".to_hex == 255
redef fun to_hex(pos, ln)
do
	if pos == null then pos = 0
	if ln == null then ln = length - pos
	var bytes = _items
	# From here on, the walk is done on bytes
	var i = char_to_byte_index(pos)
	var stop = i + ln
	var acc = 0
	while i < stop do
		acc = (acc << 4) + bytes[i].ascii.from_hex
		i += 1
	end
	return acc
end
402
redef fun copy_to_native(dst, n, src_off, dst_off)
do
	# `src_off` is relative to the first byte of `self`
	var src_start = first_byte + src_off
	_items.copy_to(dst, n, src_start, dst_off)
end
406 end
407
408 # Immutable strings of characters.
409 abstract class FlatString
410 super FlatText
411 super String
412
# Index at which `self` begins in `_items`, inclusively
redef var first_byte is noinit

redef fun chars do return new FlatStringCharView(self)

redef fun bytes do return new FlatStringByteView(self)

redef fun to_cstring
do
	var n = _byte_length
	# One more byte than the content, for the terminating 0
	var copy = new CString(n + 1)
	copy[n] = 0u8
	_items.copy_to(copy, n, _first_byte, 0)
	return copy
end
427
redef fun reversed
do
	var n = _length
	var buf = new FlatBuffer.with_capacity(_byte_length + 1)
	# Walk the characters from the last one to the first one
	var left = n
	while left > 0 do
		left -= 1
		buf.add fetch_char_at(left)
	end
	var res = buf.to_s.as(FlatString)
	res._length = n
	return res
end

redef fun fast_cstring do return _items.fast_cstring(_first_byte)
441
redef fun substring(from, count)
do
	if count <= 0 then return ""

	# A negative start shortens the request
	if from < 0 then
		count += from
		from = 0
		if count <= 0 then return ""
	end

	# Do not go past the last character
	var overshoot = (count + from) - _length
	if overshoot > 0 then count -= overshoot
	if count <= 0 then return ""

	return substring_impl(from, count, from + count - 1)
end
458
# Builds the substring of `count` characters going from the character
# `from` to the character `end_index`, both included.
#
# No check is done on the arguments, `substring` clamps them beforehand.
# `_items` is shared with the result, no byte is copied.
private fun substring_impl(from, count, end_index: Int): String do
	# Distances, in characters, between the cached position and each bound
	var cache = _position
	var dfrom = (cache - from).abs
	var dend = (cache - end_index).abs

	# Look up the bound closest to the cache first: the second lookup
	# then starts from a cache sitting on the first bound.
	var bytefrom: Int
	var byteto: Int
	if dfrom < dend then
		bytefrom = char_to_byte_index(from)
		byteto = char_to_byte_index(end_index)
	else
		byteto = char_to_byte_index(end_index)
		bytefrom = char_to_byte_index(from)
	end

	# `byteto` is the first byte of the last character, move it to its last byte
	var its = _items
	byteto += its.length_of_char_at(byteto) - 1

	var s = new FlatString.full(its, byteto - bytefrom + 1, bytefrom, count)
	return s
end
480
redef fun empty do return "".as(FlatString)

redef fun to_upper
do
	var res = new FlatBuffer.with_capacity(_byte_length + 1)
	for c in chars do res.add(c.to_upper)
	return res.to_s
end

redef fun to_lower
do
	var res = new FlatBuffer.with_capacity(_byte_length + 1)
	for c in chars do res.add(c.to_lower)
	return res.to_s
end

redef fun output
do
	for c in chars do
		c.output
	end
end
517
518 ##################################################
519 # String Specific Methods #
520 ##################################################
521
# Low-level creation of a new string with minimal data.
#
# `_items` will be used as is, without copy, to retrieve the characters of the string.
# Aliasing issues is the responsibility of the caller.
private new with_infos(items: CString, byte_length, from: Int)
do
	# The number of characters is not given: count them, then proceed as `full`
	var char_count = items.utf8_length(from, byte_length)
	return new FlatString.full(items, byte_length, from, char_count)
end

# Low-level creation of a new string with all the data.
#
# `_items` will be used as is, without copy, to retrieve the characters of the string.
# Aliasing issues is the responsibility of the caller.
private new full(items: CString, byte_length, from, length: Int)
do
	# As many bytes as characters: the ASCII variant is used
	var one_byte_per_char = byte_length == length
	if not one_byte_per_char then
		return new UnicodeFlatString.full_data(items, byte_length, from, length)
	end
	return new ASCIIFlatString.full_data(items, byte_length, from, length)
end
542
redef fun ==(other)
do
	if not other isa FlatText then return super

	if self.object_id == other.object_id then return true

	# Different sizes in bytes: the contents cannot be the same
	var n = _byte_length
	if other._byte_length != n then return false

	var mine = _items
	var theirs = other._items
	var i = _first_byte
	var j = other.first_byte
	var stop = i + n
	while i < stop do
		if mine[i] != theirs[j] then return false
		i += 1
		j += 1
	end

	return true
end
569
redef fun <(other)
do
	if not other isa FlatText then return super

	if self.object_id == other.object_id then return false

	var mine = _items
	var theirs = other._items

	var my_bytes = _byte_length
	var its_bytes = other.byte_length

	# Only the bytes present on both sides can be compared one to one
	var shared = my_bytes
	if its_bytes < shared then shared = its_bytes

	var i = _first_byte
	var j = other.first_byte
	var stop = i + shared
	while i < stop do
		var a = mine[i]
		var b = theirs[j]
		# The first differing byte decides
		if a != b then return a < b
		i += 1
		j += 1
	end

	# One is a prefix of the other: the shortest comes first
	return my_bytes < its_bytes
end
599
# Concatenation of `self` and of `o.to_s`, as a new string
#
# Aborts when `o.to_s` is not a `FlatText`.
redef fun +(o) do
	var s = o.to_s
	var slen = s.byte_length
	var mlen = _byte_length
	var nlen = mlen + slen
	var mits = _items
	var mifrom = _first_byte
	if s isa FlatText then
		var sits = s._items
		var sifrom = s.first_byte
		var ns = new CString(nlen + 1)
		mits.copy_to(ns, mlen, mifrom, 0)
		sits.copy_to(ns, slen, sifrom, mlen)
		# The extra byte is the terminating 0, written as `*` does
		ns[nlen] = 0u8
		return new FlatString.full(ns, nlen, 0, _length + s.length)
	else
		abort
	end
end
618
redef fun *(i)
do
	var unit_bytes = _byte_length
	var total_bytes = unit_bytes * i
	var total_chars = _length * i
	var src = _items
	var start = _first_byte
	# One more byte than the content, for the terminating 0
	var dst = new CString(total_bytes + 1)
	dst[total_bytes] = 0u8
	# Lay the copies one after the other
	for k in [0 .. i[ do
		src.copy_to(dst, unit_bytes, start, k * unit_bytes)
	end
	return new FlatString.full(dst, total_bytes, 0, total_chars)
end
636
redef fun hash
do
	# Already computed once: reuse it
	var known = hash_cache
	if known != null then return known

	# djb2 hash algorithm
	var h = 5381
	var bytes = _items
	var stop = last_byte
	var i = _first_byte
	while i <= stop do
		h = (h << 5) + h + bytes[i].to_i
		i += 1
	end

	hash_cache = h
	return h
end

redef fun substrings do return new FlatSubstringsIter(self)
659 end
660
# Regular Nit UTF-8 strings
private class UnicodeFlatString
	super FlatString

	init full_data(items: CString, byte_length, from, length: Int)
	do
		_items = items
		_byte_length = byte_length
		_length = length
		_first_byte = from
		# The byte cache starts on the first byte of the string
		_bytepos = from
	end

	redef fun substring_from(from)
	do
		var n = _length
		if from >= n then return empty
		if from <= 0 then return self
		var start = char_to_byte_index(from)
		# Bytes of the characters that are left out
		var skipped = start - _first_byte
		return new FlatString.full(items, byte_length - skipped, start, n - from)
	end
end
682
# Special cases of String where all the characters are ASCII-based
#
# Optimizes access operations to O(1) complexity.
private class ASCIIFlatString
	super FlatString

	init full_data(items: CString, byte_length, from, length: Int)
	do
		_items = items
		_byte_length = byte_length
		_length = length
		_first_byte = from
		_bytepos = from
	end

	redef fun [](idx)
	do
		assert idx >= 0 and idx < _byte_length
		var at = _first_byte + idx
		return _items[at].ascii
	end

	redef fun substring(from, count)
	do
		if count <= 0 then return ""
		# Do not go past the last character
		var overshoot = (count + from) - _length
		if overshoot > 0 then count -= overshoot
		if count <= 0 then return ""
		# A negative start shortens the request
		if from < 0 then
			count += from
			if count <= 0 then return ""
			from = 0
		end
		return new ASCIIFlatString.full_data(_items, count, _first_byte + from, count)
	end

	redef fun reversed
	do
		var buf = new FlatBuffer.with_capacity(_byte_length + 1)
		var left = _length
		while left > 0 do
			left -= 1
			buf.add self[left]
		end
		return buf.to_s.as(FlatString)
	end

	# One byte per character: a character index is a byte offset from `_first_byte`
	redef fun char_to_byte_index(index)
	do
		return _first_byte + index
	end

	redef fun substring_impl(from, count, end_index)
	do
		var start = _first_byte + from
		return new ASCIIFlatString.full_data(_items, count, start, count)
	end

	redef fun fetch_char_at(i)
	do
		var at = _first_byte + i
		return _items[at].ascii
	end
end
734
# Iterator on the characters of a `FlatString`, from `curr_pos` down to the first one
private class FlatStringCharReverseIterator
	super IndexedIterator[Char]

	# String whose characters are visited
	var target: FlatString

	# Index (in characters) of the current item
	var curr_pos: Int

	redef fun index do return curr_pos

	redef fun is_ok
	do
		return curr_pos >= 0
	end

	redef fun item
	do
		return target[curr_pos]
	end

	redef fun next
	do
		curr_pos -= 1
	end
end
751
# Iterator on the characters of a `FlatString`, from `curr_pos` up to the last one
private class FlatStringCharIterator
	super IndexedIterator[Char]

	# String whose characters are visited
	var target: FlatString

	# Index of the last character of `target`, computed at creation
	var max: Int is noautoinit

	# Index (in characters) of the current item
	var curr_pos: Int

	init
	do
		max = target._length - 1
	end

	redef fun index do return curr_pos

	redef fun is_ok
	do
		return curr_pos <= max
	end

	redef fun item
	do
		return target[curr_pos]
	end

	redef fun next
	do
		curr_pos += 1
	end
end
772
# View of a `FlatString` as a sequence of characters
private class FlatStringCharView
	super StringCharView

	redef type SELFTYPE: FlatString

	redef fun [](index)
	do
		return target[index]
	end

	redef fun iterator_from(start)
	do
		return new FlatStringCharIterator(target, start)
	end

	redef fun reverse_iterator_from(start)
	do
		return new FlatStringCharReverseIterator(target, start)
	end
end
785
# Iterator on the bytes of a `FlatString`, from a given byte down to the first one
private class FlatStringByteReverseIterator
	super IndexedIterator[Byte]

	# String whose bytes are visited
	var target: FlatString

	# Native storage of `target`
	var target_items: CString is noautoinit

	# Current position: given relative to `target`, kept as an index in `target_items`
	var curr_pos: Int

	init
	do
		var str = target
		curr_pos += str._first_byte
		target_items = str._items
	end

	redef fun index
	do
		return curr_pos - target._first_byte
	end

	redef fun is_ok
	do
		return curr_pos >= target._first_byte
	end

	redef fun item
	do
		return target_items[curr_pos]
	end

	redef fun next
	do
		curr_pos -= 1
	end
end
811
# Iterator on the bytes of a `FlatString`, from a given byte up to the last one
private class FlatStringByteIterator
	super IndexedIterator[Byte]

	# String whose bytes are visited
	var target: FlatString

	# Native storage of `target`
	var target_items: CString is noautoinit

	# Current position: given relative to `target`, kept as an index in `target_items`
	var curr_pos: Int

	init
	do
		var str = target
		curr_pos += str._first_byte
		target_items = str._items
	end

	redef fun index
	do
		return curr_pos - target._first_byte
	end

	redef fun is_ok
	do
		return curr_pos <= target.last_byte
	end

	redef fun item
	do
		return target_items[curr_pos]
	end

	redef fun next
	do
		curr_pos += 1
	end
end
837
# View of a `FlatString` as a sequence of bytes
private class FlatStringByteView
	super StringByteView

	redef type SELFTYPE: FlatString

	redef fun [](index)
	do
		var str = _target
		# `index` is relative to the string: it must fall within its bytes
		assert index >= 0 and index < str._byte_length
		return str._items[str._first_byte + index]
	end

	redef fun iterator_from(start)
	do
		return new FlatStringByteIterator(target, start)
	end

	redef fun reverse_iterator_from(start)
	do
		return new FlatStringByteReverseIterator(target, start)
	end
end
858
redef class Buffer
	# New buffers are `FlatBuffer`
	redef new
	do
		return new FlatBuffer
	end

	redef new with_cap(i)
	do
		return new FlatBuffer.with_capacity(i)
	end
end
864
865 # Mutable strings of characters.
866 class FlatBuffer
867 super FlatText
868 super Buffer
869
redef fun chars do return new FlatBufferCharView(self)

redef fun bytes do return new FlatBufferByteView(self)

private var capacity = 0

redef fun fast_cstring do return _items.fast_cstring(0)

redef fun substrings do return new FlatSubstringsIter(self)

# Re-copies the `CString` into a new one and sets it as the new `Buffer`
#
# This happens when an operation modifies the current `Buffer` and
# the Copy-On-Write flag `written` is set at true.
private fun reset
do
	var fresh = new CString(capacity)
	var n = _byte_length
	if n != 0 then _items.copy_to(fresh, n, 0, 0)
	_items = fresh
	written = false
end
890
# Shifts the content of the buffer by `len` bytes to the right, starting at byte `from`
#
# When the shifted content does not fit in `capacity`, a larger storage
# is allocated and becomes `_items`: callers holding the previous
# `_items` must read it again after the call.
#
# Internal only, does not modify _byte_length or length, this is the caller's responsibility
private fun rshift_bytes(from: Int, len: Int) do
	var oit = _items
	var nit = oit
	var bt = _byte_length
	var needed = bt + len
	if needed > capacity then
		var ncap = capacity * 2 + 2
		if ncap < needed then ncap = needed
		nit = new CString(ncap)
		# The bytes before `from` stay where they are
		oit.copy_to(nit, from, 0, 0)
		capacity = ncap
		_items = nit
		# The new storage is not shared with anything
		written = false
	end
	oit.copy_to(nit, bt - from, from, from + len)
end

# Shifts the content of the buffer by `len` bytes to the left, starting at `from`
#
# Internal only, does not modify _byte_length or length, this is the caller's responsibility
private fun lshift_bytes(from: Int, len: Int) do
	var it = _items
	it.copy_to(it, _byte_length - from, from, from - len)
end

redef fun []=(index, item)
do
	assert index >= 0 and index <= _length
	if written then reset
	if index == _length then
		add item
		return
	end
	var it = _items
	var ip = it.char_to_byte_index(index)
	var c = it.char_at(ip)
	var clen = c.u8char_len
	var itemlen = item.u8char_len
	var size_diff = itemlen - clen
	if size_diff > 0 then
		rshift_bytes(ip + clen, size_diff)
		# The shift may have moved the content to a larger storage
		it = _items
	else if size_diff < 0 then
		lshift_bytes(ip + clen, -size_diff)
	end
	if size_diff != 0 then
		_byte_length += size_diff
		# The following characters moved: restart the char/byte
		# cache used by `[]` from the beginning of the buffer
		_position = 0
		_bytepos = 0
	end
	it.set_char_at(ip, item)
end
936
# Inserts `s` before the character at index `pos`
#
# `pos == length` is accepted and appends `s`.
redef fun insert(s, pos) do
	assert pos >= 0 and pos <= length
	if pos == length then
		append s
		return
	end
	# Existing bytes are about to move: work on a private copy if the
	# storage was handed to a string by `to_s`
	if written then reset
	var slen = s.byte_length
	enlarge(byte_length + slen)
	var it = _items
	var shpos = it.char_to_byte_index(pos)
	rshift_bytes(shpos, slen)
	s.copy_to_native(it, slen, 0, shpos)
	length += s.length
	byte_length += slen
	# Characters moved: restart the char/byte cache used by `[]`
	# from the beginning of the buffer
	_position = 0
	_bytepos = 0
end
952
# Inserts `c` before the character at index `pos`
#
# `pos == length` is accepted and adds `c` at the end.
redef fun insert_char(c, pos) do
	assert pos >= 0 and pos <= length
	if pos == length then
		add c
		return
	end
	# Existing bytes are about to move: work on a private copy if the
	# storage was handed to a string by `to_s`
	if written then reset
	var clen = c.u8char_len
	enlarge(byte_length + clen)
	var it = _items
	var shpos = it.char_to_byte_index(pos)
	rshift_bytes(shpos, clen)
	it.set_char_at(shpos, c)
	length += 1
	byte_length += clen
	# Characters moved: restart the char/byte cache used by `[]`
	# from the beginning of the buffer
	_position = 0
	_bytepos = 0
end
968
redef fun add(c)
do
	if written then reset
	var at = _byte_length
	var width = c.u8char_len
	# Make room for the bytes of `c`, then write them after the content
	enlarge(at + width)
	_items.set_char_at(at, c)
	_byte_length = at + width
	_length += 1
end
979
# Removes all the characters of the buffer
redef fun clear do
	_byte_length = 0
	_length = 0
	# No character is left: restart the char/byte cache used by `[]`
	# so that it does not refer to the removed content
	_position = 0
	_bytepos = 0
	if written then
		# The storage is shared: start again with a small private one
		_capacity = 16
		reset
	end
end

redef fun empty do return new Buffer
990
redef fun enlarge(cap)
do
	var current = capacity
	if current >= cap then return

	# Grow by doubling, from at least 16, until `cap` is exceeded
	var grown = current
	if grown <= 16 then grown = 16
	while grown <= cap do grown *= 2

	# The COW flag can be set at false here, since
	# it does a copy of the current `Buffer`
	written = false
	var fresh = new CString(grown)
	var n = _byte_length
	if n > 0 then _items.copy_to(fresh, n, 0, 0)
	_items = fresh
	capacity = grown
end
1009
redef fun to_s
do
	# The storage is now shared with the returned string
	written = true
	var n = _byte_length
	if n == 0 then _items = new CString(1)
	return new FlatString.full(_items, n, 0, _length)
end

redef fun to_cstring
do
	var n = _byte_length
	# One more byte than the content, for the terminating 0
	var copy = new CString(n + 1)
	copy[n] = 0u8
	if _length > 0 then _items.copy_to(copy, n, 0, 0)
	return copy
end
1026
# Create a new empty string.
init do end

# Low-level creation a new buffer with given data.
#
# `_items` will be used as is, without copy, to store the characters of the buffer.
# Aliasing issues is the responsibility of the caller.
#
# If `_items` is shared, `written` should be set to true after the creation
# so that a modification will do a copy-on-write.
private init with_infos(items: CString, capacity, byte_length, length: Int)
do
	_items = items
	_byte_length = byte_length
	_length = length
	self.capacity = capacity
end
1044
# Create a new string copied from `s`.
init from(s: Text)
do
	var total = s.byte_length
	_items = new CString(total)
	# Each substring is read from its own first byte and written
	# right after the previous one
	var off = 0
	for i in s.substrings do
		var n = i._byte_length
		i._items.copy_to(_items, n, i.first_byte, off)
		off += n
	end
	_byte_length = total
	_length = s.length
	_capacity = total
end
1054
# Create a new empty string with a given capacity.
init with_capacity(cap: Int)
do
	assert cap >= 0
	_byte_length = 0
	capacity = cap
	_items = new CString(cap)
end
1063
redef fun append(s)
do
	if s.is_empty then return
	var added = s.byte_length
	var at = _byte_length
	var total = at + added
	enlarge(total)
	if not s isa FlatText then
		# Not a flat text: append its flat substrings one by one
		for part in s.substrings do append part
		return
	end
	s._items.copy_to(_items, added, s.first_byte, at)
	_byte_length = total
	_length += s.length
end
1079
# Copies the content of self in `dest`
fun copy(start: Int, len: Int, dest: Buffer, new_start: Int)
do
	var src_view = self.chars
	var dst_view = dest.chars
	var k = 0
	while k < len do
		dst_view[new_start + k] = src_view[start + k]
		k += 1
	end
end
1089
redef fun substring(from, count)
do
	assert count >= 0
	if from < 0 then from = 0
	# Do not go past the last character
	var n = _length
	if (from + count) > n then count = n - from
	if count <= 0 then return new Buffer

	var src = _items
	var byte_start = src.char_to_byte_index(from)
	var last_start = src.char_to_byte_index(count + from - 1)
	# First byte after the last character taken
	var byte_stop = last_start + src.char_at(last_start).u8char_len
	var nbytes = byte_stop - byte_start

	var dst = new CString(nbytes)
	src.copy_to(dst, nbytes, byte_start, 0)
	return new FlatBuffer.with_infos(dst, nbytes, nbytes, count)
end
1105
redef fun append_substring_impl(s, from, length)
do
	if length <= 0 then return
	if not s isa FlatText then
		super
		return
	end
	var src = s._items
	var byte_start = s.char_to_byte_index(from)
	var last_start = s.char_to_byte_index(from + length - 1)
	# Up to the last byte of the last character
	var nbytes = last_start - byte_start + src.char_at(last_start).u8char_len
	var at = _byte_length
	enlarge(nbytes + at)
	src.copy_to(_items, nbytes, byte_start, at)
	_byte_length = at + nbytes
	_length += length
end
1121
# Removes `len` characters (1 if `len` is `null`) starting at the character `p`
redef fun remove_at(p, len) do
	if len == null then len = 1
	if len == 0 then return
	# Bytes are moved in place: work on a private copy if the
	# storage was handed to a string by `to_s`
	if written then reset
	var its = _items
	var bst = char_to_byte_index(p)
	var bend = char_to_byte_index(p + len - 1)
	# `bend` becomes the first byte after the removed characters
	bend += its.char_at(bend).u8char_len
	var blen = bend - bst
	lshift_bytes(bend, blen)
	byte_length -= blen
	length -= len
	# Characters moved: restart the char/byte cache used by `[]`
	# from the beginning of the buffer
	_position = 0
	_bytepos = 0
end
1134
redef fun reverse
do
	written = false
	# Fill another buffer backwards, then take over its storage
	var flipped = new FlatBuffer.with_capacity(capacity)
	var it = chars.reverse_iterator
	while it.is_ok do
		flipped.add it.item
		it.next
	end
	_items = flipped._items
end
1142
redef fun times(repeats)
do
	# The current content, appended `repeats - 1` more times
	var snapshot = new FlatString.full(_items, _byte_length, 0, _length)
	var done = 1
	while done < repeats do
		append(snapshot)
		done += 1
	end
end
1151
redef fun upper
do
	if written then reset
	var n = _length
	var i = 0
	while i < n do
		self[i] = self[i].to_upper
		i += 1
	end
end

redef fun lower
do
	if written then reset
	var n = _length
	var i = 0
	while i < n do
		self[i] = self[i].to_lower
		i += 1
	end
end
1163 end
1164
# Iterator on the bytes of a `FlatBuffer`, from `curr_pos` down to the first one
private class FlatBufferByteReverseIterator
	super IndexedIterator[Byte]

	# Buffer whose bytes are visited
	var target: FlatBuffer

	# Native storage of `target`, as it was at creation
	var target_items: CString is noautoinit

	# Index (in bytes) of the current item
	var curr_pos: Int

	init
	do
		target_items = target._items
	end

	redef fun index do return curr_pos

	redef fun is_ok
	do
		return curr_pos >= 0
	end

	redef fun item
	do
		return target_items[curr_pos]
	end

	redef fun next
	do
		curr_pos -= 1
	end
end
1185
# View of a `FlatBuffer` as a sequence of bytes
private class FlatBufferByteView
	super BufferByteView

	redef type SELFTYPE: FlatBuffer

	redef fun [](index)
	do
		return target._items[index]
	end

	redef fun iterator_from(pos)
	do
		return new FlatBufferByteIterator(target, pos)
	end

	redef fun reverse_iterator_from(pos)
	do
		return new FlatBufferByteReverseIterator(target, pos)
	end
end
1198
# Iterator on the bytes of a `FlatBuffer`, from `curr_pos` up to the last one
private class FlatBufferByteIterator
	super IndexedIterator[Byte]

	# Buffer whose bytes are visited
	var target: FlatBuffer

	# Native storage of `target`, as it was at creation
	var target_items: CString is noautoinit

	# Index (in bytes) of the current item
	var curr_pos: Int

	init
	do
		target_items = target._items
	end

	redef fun index do return curr_pos

	redef fun is_ok
	do
		return curr_pos < target._byte_length
	end

	redef fun item
	do
		return target_items[curr_pos]
	end

	redef fun next
	do
		curr_pos += 1
	end
end
1219
# Iterator on the characters of a `FlatBuffer`, from `curr_pos` down to the first one
private class FlatBufferCharReverseIterator
	super IndexedIterator[Char]

	# Buffer whose characters are visited
	var target: FlatBuffer

	# Index (in characters) of the current item
	var curr_pos: Int

	redef fun index do return curr_pos

	redef fun is_ok
	do
		return curr_pos >= 0
	end

	redef fun item
	do
		return target[curr_pos]
	end

	redef fun next
	do
		curr_pos -= 1
	end
end
1236
# View of a `FlatBuffer` as a sequence of characters
private class FlatBufferCharView
	super BufferCharView

	redef type SELFTYPE: FlatBuffer

	redef fun [](index)
	do
		return target[index]
	end

	redef fun []=(index, item)
	do
		assert index >= 0 and index <= length
		# Writing right after the last character is an addition
		if index != length then
			target[index] = item
		else
			add(item)
		end
	end

	redef fun push(c) do target.add(c)

	redef fun add(c) do target.add(c)

	# Makes sure that `target` can hold at least `cap` bytes
	fun enlarge(cap: Int) do target.enlarge(cap)

	redef fun append(s)
	do
		var count = s.length
		if target.capacity < count then enlarge(count + target._length)
		for c in s do target.add c
	end

	redef fun iterator_from(pos)
	do
		return new FlatBufferCharIterator(target, pos)
	end

	redef fun reverse_iterator_from(pos)
	do
		return new FlatBufferCharReverseIterator(target, pos)
	end
end
1281
# Iterator on the characters of a `FlatBuffer`, from `curr_pos` up to the last one
private class FlatBufferCharIterator
	super IndexedIterator[Char]

	# Buffer whose characters are visited
	var target: FlatBuffer

	# Index of the last character of `target`, computed at creation
	var max: Int is noautoinit

	# Index (in characters) of the current item
	var curr_pos: Int

	init
	do
		max = target._length - 1
	end

	redef fun index do return curr_pos

	redef fun is_ok
	do
		return curr_pos <= max
	end

	redef fun item
	do
		return target[curr_pos]
	end

	redef fun next
	do
		curr_pos += 1
	end
end
1302
redef class CString

	# Get a `String` from the data at `self` copied into Nit memory
	#
	# Require: `self` is a null-terminated string.
	redef fun to_s do return to_s_unsafe

	# Get a `String` from `byte_length` bytes at `self` copied into Nit memory
	#
	# The string is cleaned.
	fun to_s_with_length(byte_length: Int): String do return to_s_unsafe(byte_length)

	# Build a `String` from the bytes at `self`
	#
	# * `byte_length`: number of bytes to use, `cstring_length` when null.
	# * `char_length`: number of characters; only used when `clean` is
	#   false, and computed with `utf8_length` when null.
	# * `copy`: copy the bytes into a fresh `CString`? (default: true)
	# * `clean`: pass the bytes through `clean_utf8`? (default: true)
	redef fun to_s_unsafe(byte_length, char_length, copy, clean)
	do
		byte_length = byte_length or else cstring_length
		clean = clean or else true
		copy = copy or else true

		# Clean?
		var str = null
		if clean then
			str = clean_utf8(byte_length)
			char_length = str.length
		else
			char_length = char_length or else utf8_length(0, byte_length)
		end

		# Copy? (if not already copied by `clean_utf8`)
		if copy and (str == null or str.items == self) then
			var new_cstr = new CString(byte_length + 1)
			copy_to(new_cstr, byte_length, 0, 0)
			new_cstr[byte_length] = 0u8
			str = new FlatString.full(new_cstr, byte_length, 0, char_length)
		end

		if str == null then
			str = new FlatString.full(self, byte_length, 0, char_length)
		end

		return str
	end

	# Cleans a CString if necessary
	#
	# Scans the first `len` bytes of `self` and replaces each byte that
	# cannot start a valid UTF-8 sequence by U+FFFD (bytes `EF BF BD`).
	# A sequence is rejected when its leading byte is malformed, when it
	# is longer than the remaining bytes, or when the decoded code point
	# is outside of the range allowed for a sequence of that length.
	#
	# When nothing is replaced, the returned `FlatString` is built directly
	# on `self`; otherwise it is built on a freshly allocated `CString`.
	fun clean_utf8(len: Int): FlatString do
		# Byte positions in `self` that must be replaced by U+FFFD
		var replacements: nullable Array[Int] = null
		# Byte length of the cleaned string
		var end_length = len
		var pos = 0
		# Number of characters of the cleaned string
		var chr_ln = 0
		# Number of bytes still to examine
		var rem = len
		while rem > 0 do
			# Fast path: skip groups of 4 bytes that are all ASCII
			while rem >= 4 do
				var i = fetch_4_chars(pos)
				if i & 0x80808080u32 != 0u32 then break
				pos += 4
				chr_ln += 4
				rem -= 4
			end
			if rem == 0 then break
			var b = self[pos]
			if b & 0x80u8 == 0x00u8 then
				pos += 1
				chr_ln += 1
				rem -= 1
				continue
			end
			var nxst = length_of_char_at(pos)
			var ok_st: Bool
			if nxst > rem then
				# The sequence announced by the leading byte runs past
				# the `len` bytes to clean: decoding it would read bytes
				# outside of the requested window and leave a dangling
				# leading byte in the result.
				ok_st = false
			else if nxst == 1 then
				ok_st = b & 0x80u8 == 0u8
			else if nxst == 2 then
				ok_st = b & 0xE0u8 == 0xC0u8
			else if nxst == 3 then
				ok_st = b & 0xF0u8 == 0xE0u8
			else
				ok_st = b & 0xF8u8 == 0xF0u8
			end
			if not ok_st then
				if replacements == null then replacements = new Array[Int]
				replacements.add pos
				# One byte is replaced by the 3 bytes of U+FFFD
				end_length += 2
				pos += 1
				rem -= 1
				chr_ln += 1
				continue
			end
			var ok_c: Bool
			var c = char_at(pos)
			var cp = c.code_point
			# Reject code points outside of the range of their sequence length
			if nxst == 1 then
				ok_c = cp >= 0 and cp <= 0x7F
			else if nxst == 2 then
				ok_c = cp >= 0x80 and cp <= 0x7FF
			else if nxst == 3 then
				ok_c = cp >= 0x800 and cp <= 0xFFFF
				ok_c = ok_c and not (cp >= 0xD800 and cp <= 0xDFFF) and cp != 0xFFFE and cp != 0xFFFF
			else
				ok_c = cp >= 0x10000 and cp <= 0x10FFFF
			end
			if not ok_c then
				if replacements == null then replacements = new Array[Int]
				replacements.add pos
				end_length += 2
				pos += 1
				chr_ln += 1
				rem -= 1
				continue
			end
			var clen = c.u8char_len
			pos += clen
			rem -= clen
			chr_ln += 1
		end
		var ret = self
		if end_length != len then
			# Rebuild the bytes: copy the chunks between two replacements
			# as is, and write U+FFFD in place of each rejected byte.
			ret = new CString(end_length)
			var old_repl = 0
			var off = 0
			var repls = replacements.as(not null)
			var r = repls.items.as(not null)
			var imax = repls.length
			for i in [0 .. imax[ do
				var repl_pos = r[i]
				var chkln = repl_pos - old_repl
				copy_to(ret, chkln, old_repl, off)
				off += chkln
				ret[off] = 0xEFu8
				ret[off + 1] = 0xBFu8
				ret[off + 2] = 0xBDu8
				old_repl = repl_pos + 1
				off += 3
			end
			# Copy what remains after the last replacement
			copy_to(ret, len - old_repl, old_repl, off)
		end
		return new FlatString.full(ret, end_length, 0, chr_ln)
	end

	# Sets the next bytes at position `pos` to the value of `c`, encoded in UTF-8
	#
	# Very unsafe, make sure to have room for this char prior to calling this function.
	private fun set_char_at(pos: Int, c: Char) do
		var cp = c.code_point
		# ASCII: a single byte holding the code point itself
		if cp < 128 then
			self[pos] = cp.to_b
			return
		end
		var ln = c.u8char_len
		if ln == 2 then
			self[pos] = (0xC0 | ((cp & 0x7C0) >> 6)).to_b
			self[pos + 1] = (0x80 | (cp & 0x3F)).to_b
		else if ln == 3 then
			self[pos] = (0xE0 | ((cp & 0xF000) >> 12)).to_b
			self[pos + 1] = (0x80 | ((cp & 0xFC0) >> 6)).to_b
			self[pos + 2] = (0x80 | (cp & 0x3F)).to_b
		else if ln == 4 then
			self[pos] = (0xF0 | ((cp & 0x1C0000) >> 18)).to_b
			self[pos + 1] = (0x80 | ((cp & 0x3F000) >> 12)).to_b
			self[pos + 2] = (0x80 | ((cp & 0xFC0) >> 6)).to_b
			self[pos + 3] = (0x80 | (cp & 0x3F)).to_b
		end
	end
end
1464
redef class Int
	# return displayable int in base 10 and signed
	#
	#     assert 1.to_s == "1"
	#     assert (-123).to_s == "-123"
	redef fun to_s do
		# The two most common values are answered with literals
		if self == 0 then return "0"
		if self == 1 then return "1"

		# Room for the characters reported by `int_to_s_len` plus a
		# terminating null byte
		var digits = int_to_s_len
		var size = digits + 1
		var buf = new CString(size)
		buf[digits] = 0u8
		native_int_to_s(buf, size)
		return new FlatString.full(buf, digits, 0, digits)
	end
end
1482
redef class Array[E]

	# Fast implementation
	#
	# The `to_s` of each non-null element is computed once, then the bytes
	# of all these strings are copied back to back into a single `CString`.
	redef fun plain_to_s
	do
		var count = _length
		if count == 0 then return ""
		var elems = _items.as(not null)
		if count == 1 then
			var only = elems[0]
			if only == null then return ""
			return only.to_s
		end

		# First pass: stringify the non-null elements and sum their byte sizes
		var parts = new NativeArray[String](count)
		var total = 0
		var kept = 0
		var idx = 0
		while idx < count do
			var e = elems[idx]
			idx += 1
			if e == null then continue
			var str = e.to_s
			total += str.byte_length
			parts[kept] = str
			kept += 1
		end

		# Second pass: copy the bytes of each part into the result
		var buf = new CString(total + 1)
		buf[total] = 0u8
		var off = 0
		idx = 0
		while idx < kept do
			var part = parts[idx]
			idx += 1
			if part isa FlatString then
				var n = part._byte_length
				part._items.copy_to(buf, n, part._first_byte, off)
				off += n
			else
				# Non-flat string: copy each of its flat substrings in turn
				for sub in part.substrings do
					var flat = sub.as(FlatString)
					var n = flat._byte_length
					flat._items.copy_to(buf, n, flat._first_byte, off)
					off += n
				end
			end
		end
		return new FlatString.with_infos(buf, total, 0)
	end
end
1532
redef class NativeArray[E]
	# Concatenate all the strings of `self` into a single `String`
	#
	# `self` must be a `NativeArray[String]`.
	# The bytes of the elements are copied back to back, in order, into a
	# single `CString`.
	redef fun native_to_s do
		assert self isa NativeArray[String]
		var parts = self
		var count = length

		# First pass: total number of bytes of the result
		var total = 0
		var idx = 0
		while idx < count do
			total += parts[idx].byte_length
			idx += 1
		end

		# Second pass: copy the bytes of each element into the result
		var buf = new CString(total + 1)
		buf[total] = 0u8
		var off = 0
		idx = 0
		while idx < count do
			var part = parts[idx]
			idx += 1
			if part isa FlatString then
				var n = part._byte_length
				part._items.copy_to(buf, n, part._first_byte, off)
				off += n
			else
				# Non-flat string: copy each of its flat substrings in turn
				for sub in part.substrings do
					var flat = sub.as(FlatString)
					var n = flat._byte_length
					flat._items.copy_to(buf, n, flat._first_byte, off)
					off += n
				end
			end
		end
		return new FlatString.with_infos(buf, total, 0)
	end
end
1569
redef class Map[K,V]
	# Concatenate the couples of `self`, in iteration order
	#
	# Each couple is written as key, `couple_sep`, value; two consecutive
	# couples are separated by `sep`. A null key or value is written as
	# `<null>`. An empty map gives an empty string.
	redef fun join(sep, couple_sep)
	do
		if is_empty then return ""

		var out = new Buffer # Result
		var it = iterator
		loop
			var k = it.key
			var v = it.item
			out.append("{k or else "<null>"}{couple_sep}{v or else "<null>"}")
			it.next
			# `sep` is only written between two couples
			if not it.is_ok then break
			out.append(sep)
		end
		return out.to_s
	end
end