nit: Added link to `CONTRIBUTING.md` from the README
[nit.git] / lib / core / text / flat.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # All the array-based text representations
12 module flat
13
14 intrude import abstract_text
15 intrude import native
16
17 `{
18 #include <stdio.h>
19 #include <string.h>
20 `}
21
# Iterator over the substrings of a flat text: yields `tgt` once, then stops
private class FlatSubstringsIter
	super Iterator[FlatText]

	# The text still to be yielded, `null` once it has been consumed
	var tgt: nullable FlatText

	redef fun is_ok do return tgt != null

	redef fun next do tgt = null

	redef fun item do
		assert is_ok
		return tgt.as(not null)
	end
end
36
37 redef class FlatText
38
# First byte of the NativeString (0 unless redefined)
protected fun first_byte: Int
do
	return 0
end

# Last byte of the NativeString, computed from `first_byte` and `_bytelen`
protected fun last_byte: Int
do
	return first_byte + _bytelen - 1
end

# Cache of the latest position (char) explored in the string
var position: Int = 0

# Cached position (bytes) in the NativeString underlying the String
var bytepos: Int = 0
50
# Index of the character `index` in `_items`
#
# The last char/byte pair looked up is memoized in `_position`/`_bytepos`
# and reused as a starting point whenever it is the closest known anchor.
fun char_to_byte_index(index: Int): Int do
	var bytes = _items
	var cached_byte = _bytepos
	var step = index - _position

	# Same character as last time: nothing to compute, nothing to update.
	if step == 0 then return cached_byte

	if step == 1 or step == -1 then
		var near: Int
		if step == 1 then
			# Next character: skip over the current one.
			if bytes[cached_byte] & 0x80u8 == 0x00u8 then
				near = cached_byte + 1
			else
				near = cached_byte + bytes.length_of_char_at(cached_byte)
			end
		else
			# Previous character: rewind to its first byte.
			near = bytes.find_beginning_of_char_at(cached_byte - 1)
		end
		_bytepos = near
		_position = index
		return near
	end

	# General case: start from the nearest of the cache, the beginning or the end.
	var cached_char = _position
	var from_begin = index
	var from_end = (_length - 1) - index
	var from_cache = (cached_char - index).abs
	var nearest = from_begin
	if from_cache < nearest then nearest = from_cache
	if from_end < nearest then nearest = from_end

	var anchor_byte: Int
	var anchor_char: Int
	if nearest == from_cache then
		anchor_byte = cached_byte
		anchor_char = cached_char
	else if nearest == from_begin then
		anchor_byte = first_byte
		anchor_char = 0
	else
		anchor_byte = bytes.find_beginning_of_char_at(last_byte)
		anchor_char = _length - 1
	end

	var res = bytes.char_to_byte_index_cached(index, anchor_char, anchor_byte)

	_position = index
	_bytepos = res

	return res
end
107
# By escaping `self` to HTML, how many more bytes will be needed ?
#
# Returns 0 when no byte of `self` needs to be escaped.
fun chars_to_html_escape: Int do
	var bytes = _items
	var stop = last_byte
	var i = first_byte
	var extra = 0
	while i <= stop do
		var b = bytes[i]
		i += 1
		if b == 0x3Cu8 or b == 0x3Eu8 then
			# `<` and `>` are replaced by 4 bytes
			extra += 3
		else if b == 0x26u8 or b == 0x22u8 or b == 0x27u8 or b == 0x2Fu8 then
			# `&`, `"`, `'` and `/` are replaced by 5 bytes
			extra += 4
		end
	end
	return extra
end
133
redef fun html_escape
do
	var extra = chars_to_html_escape
	# Nothing to escape: no new buffer is needed.
	if extra == 0 then return to_s
	var src = _items
	var stop = last_byte
	var out_len = _bytelen + extra
	var out = new NativeString(out_len)
	var w = 0
	var r = first_byte
	while r <= stop do
		var b = src[r]
		r += 1
		# Special codes:
		# Some HTML characters are used as meta-data, they need
		# to be replaced by an HTML-Escaped equivalent
		#
		# * 0x3C (<) => &lt;
		# * 0x3E (>) => &gt;
		# * 0x26 (&) => &amp;
		# * 0x22 (") => &#34;
		# * 0x27 (') => &#39;
		# * 0x2F (/) => &#47;
		if b == 0x3Cu8 or b == 0x3Eu8 then
			# `&lt;` and `&gt;` only differ by their second byte
			out[w] = 0x26u8
			if b == 0x3Cu8 then out[w + 1] = 0x6Cu8 else out[w + 1] = 0x67u8
			out[w + 2] = 0x74u8
			out[w + 3] = 0x3Bu8
			w += 4
		else if b == 0x26u8 then
			out[w] = 0x26u8
			out[w + 1] = 0x61u8
			out[w + 2] = 0x6Du8
			out[w + 3] = 0x70u8
			out[w + 4] = 0x3Bu8
			w += 5
		else if b == 0x22u8 or b == 0x27u8 or b == 0x2Fu8 then
			# Numeric entities: `&#` + two decimal digits + `;`
			out[w] = 0x26u8
			out[w + 1] = 0x23u8
			if b == 0x2Fu8 then
				out[w + 2] = 0x34u8
				out[w + 3] = 0x37u8
			else
				out[w + 2] = 0x33u8
				if b == 0x22u8 then out[w + 3] = 0x34u8 else out[w + 3] = 0x39u8
			end
			out[w + 4] = 0x3Bu8
			w += 5
		else
			# Regular byte: copied verbatim
			out[w] = b
			w += 1
		end
	end
	return new FlatString.with_infos(out, out_len, 0)
end
205
# By escaping `self` to C, how many more bytes will be needed ?
#
# This enables a double-optimization in `escape_to_c` since if this
# method returns 0, then `self` does not need escaping and can be
# returned as-is
fun chars_to_escape_to_c: Int do
	var bytes = _items
	var stop = last_byte
	var i = first_byte
	var extra = 0
	while i <= stop do
		var b = bytes[i]
		i += 1
		if b == 0x0Au8 or b == 0x09u8 or b == 0x22u8 or b == 0x27u8 or b == 0x5Cu8 then
			# Two-byte escape sequence: one more byte
			extra += 1
		else if b < 32u8 then
			# Other control bytes use a four-byte escape sequence: three more bytes
			extra += 3
		end
	end
	return extra
end
235
redef fun escape_to_c do
	var extra = chars_to_escape_to_c
	# Nothing to escape: no new buffer is needed.
	if extra == 0 then return self.to_s
	var src = _items
	var stop = last_byte
	var out_len = _bytelen + extra
	var out = new NativeString(out_len)
	var r = first_byte
	var w = 0
	while r <= stop do
		var b = src[r]
		r += 1
		# Special codes:
		#
		# Any byte with value < 32 is a control character
		# All their uses will be replaced by their octal
		# value in C.
		#
		# There are two exceptions however:
		#
		# * 0x09 => \t
		# * 0x0A => \n
		#
		# Aside from the code points above, the following are:
		#
		# * 0x22 => \"
		# * 0x27 => \'
		# * 0x5C => \\
		if b == 0x22u8 or b == 0x27u8 or b == 0x5Cu8 then
			# The byte itself, prefixed by a backslash
			out[w] = 0x5Cu8
			out[w + 1] = b
			w += 2
		else if b == 0x09u8 then
			out[w] = 0x5Cu8
			out[w + 1] = 0x74u8
			w += 2
		else if b == 0x0Au8 then
			out[w] = 0x5Cu8
			out[w + 1] = 0x6Eu8
			w += 2
		else if b < 32u8 then
			# Backslash, `0`, then the two low octal digits of the byte
			out[w] = 0x5Cu8
			out[w + 1] = 0x30u8
			out[w + 2] = ((b & 0x38u8) >> 3) + 0x30u8
			out[w + 3] = (b & 0x07u8) + 0x30u8
			w += 4
		else
			out[w] = b
			w += 1
		end
	end
	return out.to_s_unsafe(out_len)
end
297
redef fun [](index) do
	var len = _length

	# Check the bounds before any shortcut: the shortcuts below read `_items`
	# directly, and the "same char" one is taken for any `index` equal to the
	# cached position, including index 0 on an empty text.
	assert index >= 0 and index < len

	# Statistically:
	# * ~70% want the next char
	# * ~23% want the previous
	# * ~7% want the same char
	#
	# So it makes sense to shortcut early. And early is here.
	var dpos = index - _position
	var b = _bytepos
	if dpos == 1 and index < len - 1 then
		var its = _items
		var c = its[b]
		if c & 0x80u8 == 0x00u8 then
			# We want the next, and current is easy.
			# So next is easy to find!
			b += 1
			_position = index
			_bytepos = b
			# The rest will be done by `dpos==0` below.
			dpos = 0
		end
	else if dpos == -1 and index > 1 then
		var its = _items
		var c = its[b-1]
		if c & 0x80u8 == 0x00u8 then
			# We want the previous, and it is easy.
			_position = index
			_bytepos = b - 1
			return c.ascii
		end
	end
	if dpos == 0 then
		# We know what we want (+0 or +1) just get it now!
		var its = _items
		var c = its[b]
		if c & 0x80u8 == 0x00u8 then return c.ascii
		return its.char_at(b)
	end

	# No shortcut applies: go through the generic (cached) lookup.
	return fetch_char_at(index)
end
344
# Gets a `Char` at `index` in `self`
#
# WARNING: Use at your own risks as no bound-checking is done
fun fetch_char_at(index: Int): Char do
	var at = char_to_byte_index(index)
	var bytes = _items
	var lead = bytes[at]
	# A byte with its high bit set is handed to `char_at` for decoding.
	if lead & 0x80u8 != 0x00u8 then return bytes.char_at(at)
	return lead.ascii
end
355
# If `self` contains only digits and alpha <= 'f', return the corresponding integer.
#
#     assert "ff".to_hex == 255
redef fun to_hex(pos, ln) do
	if pos == null then pos = 0
	if ln == null then ln = length - pos
	var bytes = _items
	var i = char_to_byte_index(pos)
	var stop = i + ln
	var acc = 0
	while i < stop do
		# Each digit contributes 4 more bits
		acc = (acc << 4) + bytes[i].ascii.from_hex
		i += 1
	end
	return acc
end
372
redef fun copy_to_native(dst, n, src_off, dst_off)
do
	# `src_off` is relative to the text, which starts at `first_byte` in `_items`
	var src_from = first_byte + src_off
	_items.copy_to(dst, n, src_from, dst_off)
end
376 end
377
378 # Immutable strings of characters.
379 abstract class FlatString
380 super FlatText
381 super String
382
# Index at which `self` begins in `_items`, inclusively
redef var first_byte is noinit

# Character view over `self`, built on first use
redef var chars = new FlatStringCharView(self) is lazy

# Byte view over `self`, built on first use
redef var bytes = new FlatStringByteView(self) is lazy

# Copy of the bytes of `self` followed by a 0 byte, built on first use
redef var to_cstring is lazy do
	var n = _bytelen
	var copy = new NativeString(n + 1)
	copy[n] = 0u8
	_items.copy_to(copy, n, _first_byte, 0)
	return copy
end
397
redef fun reversed do
	var buf = new FlatBuffer.with_capacity(_bytelen + 1)
	var n = _length
	# Walk the chars from the last one down to the first one
	var i = n
	while i > 0 do
		i -= 1
		buf.add fetch_char_at(i)
	end
	var res = buf.to_s.as(FlatString)
	res._length = n
	return res
end

redef fun fast_cstring
do
	return _items.fast_cstring(_first_byte)
end
411
redef fun substring(from, count)
do
	# Clip the requested window to `[0, _length[`
	if from < 0 then
		count += from
		from = 0
	end
	var over = (from + count) - _length
	if over > 0 then count -= over

	# An empty (or inverted) window yields the empty string
	if count <= 0 then return ""

	return substring_impl(from, count, from + count - 1)
end
428
# Build the substring of `count` chars going from char `from` to char `end_index` (inclusive)
#
# No bound is checked here; `substring` clips its arguments before calling.
private fun substring_impl(from, count, end_index: Int): String do
	# Resolve first the bound that is closest to the cached position:
	# each lookup moves the cache, so the second one starts from the first.
	var cache = _position
	var dfrom = (cache - from).abs
	var dend = (cache - end_index).abs

	var bytefrom: Int
	var byteto: Int
	if dfrom < dend then
		bytefrom = char_to_byte_index(from)
		byteto = char_to_byte_index(end_index)
	else
		byteto = char_to_byte_index(end_index)
		bytefrom = char_to_byte_index(from)
	end

	var its = _items
	# `byteto` is the first byte of the last char: move it to its last byte
	byteto += its.length_of_char_at(byteto) - 1

	var s = new FlatString.full(its, byteto - bytefrom + 1, bytefrom, count)
	return s
end
450
redef fun empty
do
	return "".as(FlatString)
end

redef fun to_upper
do
	var buf = new FlatBuffer.with_capacity(self._bytelen + 1)
	var n = _length
	var i = 0
	while i < n do
		buf.add(chars[i].to_upper)
		i += 1
	end
	return buf.to_s
end

redef fun to_lower
do
	var buf = new FlatBuffer.with_capacity(self._bytelen + 1)
	var n = _length
	var i = 0
	while i < n do
		buf.add(chars[i].to_lower)
		i += 1
	end
	return buf.to_s
end

redef fun output
do
	# Each char is output on its own, in order
	for c in chars do c.output
end
487
488 ##################################################
489 # String Specific Methods #
490 ##################################################
491
# Low-level creation of a new string with minimal data.
#
# `_items` will be used as is, without copy, to retrieve the characters of the string.
# Aliasing issues is the responsibility of the caller.
#
# The length in chars is computed from the bytes with `utf8_length`.
private new with_infos(items: NativeString, bytelen, from: Int)
do
	var len = items.utf8_length(from, bytelen)
	if bytelen != len then
		return new UnicodeFlatString.full_data(items, bytelen, from, len)
	else
		return new ASCIIFlatString.full_data(items, bytelen, from, len)
	end
end

# Low-level creation of a new string with all the data.
#
# `_items` will be used as is, without copy, to retrieve the characters of the string.
# Aliasing issues is the responsibility of the caller.
#
# An `ASCIIFlatString` is built when `bytelen == length`, a `UnicodeFlatString` otherwise.
private new full(items: NativeString, bytelen, from, length: Int)
do
	if bytelen != length then
		return new UnicodeFlatString.full_data(items, bytelen, from, length)
	else
		return new ASCIIFlatString.full_data(items, bytelen, from, length)
	end
end
512
redef fun ==(other)
do
	if not other isa FlatText then return super

	# Same object: trivially equal
	if self.object_id == other.object_id then return true

	# Different byte lengths: cannot be equal
	var n = _bytelen
	if other._bytelen != n then return false

	var mine = _items
	var theirs = other._items
	var i = _first_byte
	# Distance between a byte of `self` and the matching byte of `other`
	var shift = other.first_byte - i
	var stop = i + n

	while i < stop do
		if mine[i] != theirs[i + shift] then return false
		i += 1
	end

	return true
end
539
redef fun <(other)
do
	if not other isa FlatText then return super

	# An object is never lower than itself
	if self.object_id == other.object_id then return false

	var mine = _items
	var theirs = other._items

	var my_len = _bytelen
	var its_len = other.bytelen

	# Only the bytes present in both texts can be compared pairwise
	var common = my_len
	if its_len < common then common = its_len

	var i = _first_byte
	var shift = other.first_byte - i
	var stop = i + common

	while i < stop do
		var a = mine[i]
		var b = theirs[i + shift]
		# The first differing byte decides
		if a != b then return a < b
		i += 1
	end

	# One is a prefix of the other: the shortest is the lowest
	return my_len < its_len
end
569
redef fun +(o) do
	var s = o.to_s
	var slen = s.bytelen
	var mlen = _bytelen
	var nlen = mlen + slen
	if s isa FlatText then
		# One extra byte is allocated after the content
		var ns = new NativeString(nlen + 1)
		_items.copy_to(ns, mlen, _first_byte, 0)
		s._items.copy_to(ns, slen, s.first_byte, mlen)
		# Initialize the extra byte as a terminator (as `*` does) instead of
		# leaving it with whatever the allocation contained
		ns[nlen] = 0u8
		return new FlatString.full(ns, nlen, 0, _length + o.length)
	else
		# Only flat texts give access to the bytes to copy
		abort
	end
end
588
redef fun *(i) do
	var unit = _bytelen
	var total = unit * i
	var res_len = _length * i
	var src = _items
	var start = _first_byte
	# One extra byte for the terminator
	var out = new NativeString(total + 1)
	out[total] = 0u8
	# Lay the copies of `self` one after the other
	var at = 0
	while at < total do
		src.copy_to(out, unit, start, at)
		at += unit
	end
	return new FlatString.full(out, total, 0, res_len)
end
606
redef fun hash
do
	# The hash is computed once, then served from `hash_cache`
	var cached = hash_cache
	if cached != null then return cached

	# djb2 hash algorithm
	var h = 5381
	var bytes = _items
	var stop = last_byte
	var i = _first_byte
	while i <= stop do
		h = (h << 5) + h + bytes[i].to_i
		i += 1
	end

	hash_cache = h
	return h
end

redef fun substrings
do
	return new FlatSubstringsIter(self)
end
629 end
630
# Regular Nit UTF-8 strings
private class UnicodeFlatString
	super FlatString

	# Set all the fields; the byte cache starts at the first byte of the string
	init full_data(items: NativeString, bytelen, from, length: Int) do
		_first_byte = from
		_bytepos = from
		self._bytelen = bytelen
		self._length = length
		self._items = items
	end

	redef fun substring_from(from) do
		if from >= self._length then return empty
		if from <= 0 then return self
		var start = char_to_byte_index(from)
		# Bytes skipped at the beginning of `self`
		var skipped = start - _first_byte
		return new FlatString.full(items, bytelen - skipped, start, _length - from)
	end
end
652
# Special cases of String where all the characters are ASCII-based
#
# Optimizes access operations to O(1) complexity.
private class ASCIIFlatString
	super FlatString

	# Set all the fields; the byte cache starts at the first byte of the string
	init full_data(items: NativeString, bytelen, from, length: Int) do
		_first_byte = from
		_bytepos = from
		self._bytelen = bytelen
		self._length = length
		self._items = items
	end

	# One byte per char: a char index is a byte offset from `_first_byte`
	redef fun char_to_byte_index(index) do return index + _first_byte

	redef fun fetch_char_at(i) do return _items[i + _first_byte].ascii

	redef fun [](idx) do
		assert idx >= 0 and idx < _bytelen
		return _items[_first_byte + idx].ascii
	end

	redef fun substring(from, count) do
		if count <= 0 then return ""
		# Clip the end of the window
		var over = (count + from) - _length
		if over > 0 then
			count -= over
			if count <= 0 then return ""
		end
		# Clip the beginning of the window
		if from < 0 then
			count += from
			if count <= 0 then return ""
			from = 0
		end
		return new ASCIIFlatString.full_data(_items, count, _first_byte + from, count)
	end

	redef fun substring_impl(from, count, end_index) do
		return new ASCIIFlatString.full_data(_items, count, _first_byte + from, count)
	end

	redef fun reversed do
		var buf = new FlatBuffer.with_capacity(_bytelen + 1)
		var i = _length
		while i > 0 do
			i -= 1
			buf.add self[i]
		end
		return buf.to_s.as(FlatString)
	end
end
704
# Iterator over the chars of a `FlatString`, from `curr_pos` down to index 0
private class FlatStringCharReverseIterator
	super IndexedIterator[Char]

	# String iterated on
	var target: FlatString

	# Index (in chars) of the current item
	var curr_pos: Int

	redef fun index do return curr_pos

	redef fun item do return target[curr_pos]

	redef fun is_ok do return curr_pos >= 0

	redef fun next do curr_pos -= 1

end
721
# Iterator over the chars of a `FlatString`, from `curr_pos` up to the last char
private class FlatStringCharIterator
	super IndexedIterator[Char]

	# String iterated on
	var target: FlatString

	# Index of the last char of `target`, computed at construction
	var max: Int is noautoinit

	# Index (in chars) of the current item
	var curr_pos: Int

	init do max = target._length - 1

	redef fun index do return curr_pos

	redef fun item do return target[curr_pos]

	redef fun is_ok do return curr_pos <= max

	redef fun next do curr_pos += 1

end
742
# View of a `FlatString` as a sequence of chars
private class FlatStringCharView
	super StringCharView

	redef type SELFTYPE: FlatString

	redef fun [](index)
	do
		return target[index]
	end

	redef fun iterator_from(start)
	do
		return new FlatStringCharIterator(target, start)
	end

	redef fun reverse_iterator_from(start)
	do
		return new FlatStringCharReverseIterator(target, start)
	end

end
755
# Iterator over the bytes of a `FlatString`, from a given byte down to the first one
private class FlatStringByteReverseIterator
	super IndexedIterator[Byte]

	# String iterated on
	var target: FlatString

	# `_items` of `target`, fetched at construction
	var target_items: NativeString is noautoinit

	# Current position; given relative to the string, then shifted by `_first_byte` at construction
	var curr_pos: Int

	init
	do
		var str = target
		curr_pos += str._first_byte
		target_items = str._items
	end

	redef fun index do return curr_pos - target._first_byte

	redef fun item do return target_items[curr_pos]

	redef fun is_ok do return curr_pos >= target._first_byte

	redef fun next do curr_pos -= 1

end
781
# Iterator over the bytes of a `FlatString`, from a given byte up to the last one
private class FlatStringByteIterator
	super IndexedIterator[Byte]

	# String iterated on
	var target: FlatString

	# `_items` of `target`, fetched at construction
	var target_items: NativeString is noautoinit

	# Current position; given relative to the string, then shifted by `_first_byte` at construction
	var curr_pos: Int

	init
	do
		var str = target
		curr_pos += str._first_byte
		target_items = str._items
	end

	redef fun index do return curr_pos - target._first_byte

	redef fun item do return target_items[curr_pos]

	redef fun is_ok do return curr_pos <= target.last_byte

	redef fun next do curr_pos += 1

end
807
# View of a `FlatString` as a sequence of bytes
private class FlatStringByteView
	super StringByteView

	redef type SELFTYPE: FlatString

	redef fun [](index)
	do
		var str = _target
		# `index` is relative to the string: it must fall within its `_bytelen` bytes
		assert index >= 0 and index < str._bytelen
		return str._items[str._first_byte + index]
	end

	redef fun iterator_from(start)
	do
		return new FlatStringByteIterator(target, start)
	end

	redef fun reverse_iterator_from(start)
	do
		return new FlatStringByteReverseIterator(target, start)
	end

end
828
redef class Buffer
	# A `FlatBuffer` is the implementation built for a new `Buffer`
	redef new
	do
		return new FlatBuffer
	end

	# Same as `new`, with an initial capacity of `i`
	redef new with_cap(i)
	do
		return new FlatBuffer.with_capacity(i)
	end
end
834
835 # Mutable strings of characters.
836 class FlatBuffer
837 super FlatText
838 super Buffer
839
# Character view over `self`, built on first use
redef var chars: Sequence[Char] = new FlatBufferCharView(self) is lazy

# Byte view over `self`, built on first use
redef var bytes = new FlatBufferByteView(self) is lazy

# NOTE(review): not referenced by the code visible in this module; confirm before removing
private var char_cache: Int = -1

# NOTE(review): not referenced by the code visible in this module; confirm before removing
private var byte_cache: Int = -1

# Size used when a new `NativeString` is allocated for `_items`
private var capacity: Int = 0

# Real items, used as cache for when to_cstring is called
private var real_items: NativeString is noinit

redef fun fast_cstring
do
	return _items.fast_cstring(0)
end

redef fun substrings
do
	return new FlatSubstringsIter(self)
end
856
# Re-copies the `NativeString` into a new one and sets it as the new `Buffer`
#
# This happens when an operation modifies the current `Buffer` and
# the Copy-On-Write flag `written` is set at true.
private fun reset do
	var fresh = new NativeString(capacity)
	var used = _bytelen
	if used != 0 then _items.copy_to(fresh, used, 0, 0)
	_items = fresh
	# `_items` is private to `self` again
	written = false
end
867
# Shifts the content of the buffer by `len` bytes to the right, starting at byte `from`
#
# When the shifted content does not fit in `capacity`, a bigger `NativeString`
# is allocated, the bytes before `from` are copied to it, and it becomes `_items`.
#
# Internal only, does not modify _bytelen or length, this is the caller's responsibility
private fun rshift_bytes(from: Int, len: Int) do
	var oit = _items
	var nit = oit
	var bt = _bytelen
	var needed = bt + len
	if needed > capacity then
		var ncap = capacity * 2 + 2
		if ncap < needed then ncap = needed
		nit = new NativeString(ncap)
		# Bring along the bytes located before the gap
		oit.copy_to(nit, from, 0, 0)
		capacity = ncap
	end
	# Move the tail `len` bytes further (in place when no reallocation happened)
	oit.copy_to(nit, bt - from, from, from + len)
	_items = nit
end

# Shifts the content of the buffer by `len` bytes to the left, starting at `from`
#
# Internal only, does not modify _bytelen or length, this is the caller's responsibility
private fun lshift_bytes(from: Int, len: Int) do
	var it = _items
	it.copy_to(it, _bytelen - from, from, from - len)
end

redef fun []=(index, item)
do
	assert index >= 0 and index <= _length
	if written then reset
	is_dirty = true
	# Writing right after the last char is an append
	if index == _length then
		add item
		return
	end
	var it = _items
	var ip = it.char_to_byte_index(index)
	var c = it.char_at(ip)
	var clen = c.u8char_len
	var itemlen = item.u8char_len
	# The old and new chars may not have the same size in bytes:
	# make room for, or close the gap after, the replaced char
	var size_diff = itemlen - clen
	if size_diff > 0 then
		rshift_bytes(ip + clen, size_diff)
	else if size_diff < 0 then
		lshift_bytes(ip + clen, -size_diff)
	end
	_bytelen += size_diff
	# Read `_items` again: `rshift_bytes` may have replaced it by a bigger one
	_items.set_char_at(ip, item)
end
914
redef fun add(c)
do
	if written then reset
	is_dirty = true
	var at = _bytelen
	var width = c.u8char_len
	# Make sure the bytes of `c` fit after the current content
	enlarge(at + width)
	_items.set_char_at(at, c)
	_bytelen = at + width
	_length += 1
end
926
redef fun clear do
	_length = 0
	_bytelen = 0
	is_dirty = true
	if not written then return
	# `written` is set: restart from a fresh 16-byte `NativeString`
	_capacity = 16
	reset
end

redef fun empty
do
	return new Buffer
end
938
redef fun enlarge(cap)
do
	var ncap = capacity
	if cap <= ncap then return
	# Grow by doubling, starting from at least 16, until `cap` is exceeded
	if ncap <= 16 then ncap = 16
	while ncap <= cap do ncap = ncap * 2
	var fresh = new NativeString(ncap)
	var used = _bytelen
	if used > 0 then _items.copy_to(fresh, used, 0, 0)
	_items = fresh
	capacity = ncap
	# The COW flag can be set at false here, since
	# it does a copy of the current `Buffer`
	written = false
end
957
redef fun to_s
do
	# `_items` is handed to the string without copy: flag it for copy-on-write
	written = true
	var used = _bytelen
	if used == 0 then _items = new NativeString(1)
	return new FlatString.full(_items, used, 0, _length)
end
965
redef fun to_cstring
do
	# The previous copy is still valid as long as `is_dirty` is not set
	if not is_dirty then return real_items
	var n = _bytelen
	var copy = new NativeString(n + 1)
	copy[n] = 0u8
	if _length > 0 then _items.copy_to(copy, n, 0, 0)
	real_items = copy
	is_dirty = false
	return copy
end
978
# Create a new empty string.
init
do
end

# Low-level creation a new buffer with given data.
#
# `_items` will be used as is, without copy, to store the characters of the buffer.
# Aliasing issues is the responsibility of the caller.
#
# If `_items` is shared, `written` should be set to true after the creation
# so that a modification will do a copy-on-write.
private init with_infos(items: NativeString, capacity, bytelen, length: Int)
do
	self._length = length
	self._bytelen = bytelen
	self.capacity = capacity
	self._items = items
end
996
# Create a new string copied from `s`.
#
# The bytes of each substring of `s` are copied one after the other,
# each one read from its own `first_byte`.
init from(s: Text)
do
	var total = s.bytelen
	var nits = new NativeString(total)
	# Offset in `nits` at which the next substring is written
	var off = 0
	for i in s.substrings do
		var ilen = i._bytelen
		i._items.copy_to(nits, ilen, i.first_byte, off)
		off += ilen
	end
	_items = nits
	_bytelen = total
	_length = s.length
	_capacity = total
end
1006
# Create a new empty string with a given capacity.
#
# `cap` must not be negative.
init with_capacity(cap: Int)
do
	assert cap >= 0
	_bytelen = 0
	capacity = cap
	_items = new NativeString(cap)
end
1015
redef fun append(s)
do
	if s.is_empty then return
	is_dirty = true
	var added = s.bytelen
	var total = _bytelen + added
	enlarge(total)
	if not s isa FlatText then
		# Not flat: append its substrings one by one
		for part in s.substrings do append part
		return
	end
	# Flat: copy its bytes right after the current content
	s._items.copy_to(_items, added, s.first_byte, _bytelen)
	_bytelen = total
	_length += s.length
end
1032
# Copies the content of self in `dest`
#
# `len` chars are copied, from index `start` of `self` to index `new_start` of `dest`.
fun copy(start: Int, len: Int, dest: Buffer, new_start: Int)
do
	var src = self.chars
	var dst = dest.chars
	var k = 0
	while k < len do
		dst[new_start + k] = src[start + k]
		k += 1
	end
end
1042
redef fun substring(from, count)
do
	assert count >= 0
	if from < 0 then from = 0
	if (from + count) > _length then count = _length - from
	if count <= 0 then return new Buffer
	var src = _items
	var first = src.char_to_byte_index(from)
	# First byte of the last char, then the number of bytes up to its last byte
	var last = src.char_to_byte_index(from + count - 1)
	var nbytes = last + src.char_at(last).u8char_len - first
	var copy = new NativeString(nbytes)
	src.copy_to(copy, nbytes, first, 0)
	return new FlatBuffer.with_infos(copy, nbytes, nbytes, count)
end
1058
redef fun append_substring_impl(s, from, length) do
	if length <= 0 then return
	if not s isa FlatText then
		super
		return
	end
	# The content changes: the copy served by `to_cstring` must be rebuilt
	is_dirty = true
	var sits = s._items
	var bytest = s.char_to_byte_index(from)
	var bytend = s.char_to_byte_index(from + length - 1)
	# `bytend` is the first byte of the last char: count all the bytes of
	# that char, it may use more than one
	var btln = bytend - bytest + sits.length_of_char_at(bytend)
	enlarge(btln + _bytelen)
	sits.copy_to(_items, btln, bytest, _bytelen)
	_bytelen += btln
	_length += length
end
1073
redef fun reverse
do
	written = false
	# Build the reversed content in a temporary buffer, then take its bytes
	var ns = new FlatBuffer.with_capacity(capacity)
	for i in chars.reverse_iterator do ns.add i
	_items = ns._items
	# The content changed: the copy served by `to_cstring` must be rebuilt
	is_dirty = true
end
1081
redef fun times(repeats)
do
	# Snapshot of the current content, appended `repeats - 1` times
	var once = new FlatString.full(_items, _bytelen, 0, _length)
	var left = repeats - 1
	while left > 0 do
		append(once)
		left -= 1
	end
end
1090
redef fun upper
do
	if written then reset
	var n = _length
	var i = 0
	while i < n do
		self[i] = self[i].to_upper
		i += 1
	end
end

redef fun lower
do
	if written then reset
	var n = _length
	var i = 0
	while i < n do
		self[i] = self[i].to_lower
		i += 1
	end
end
1102 end
1103
# Iterator over the bytes of a `FlatBuffer`, from `curr_pos` down to byte 0
private class FlatBufferByteReverseIterator
	super IndexedIterator[Byte]

	# Buffer iterated on
	var target: FlatBuffer

	# `_items` of `target`, fetched at construction
	var target_items: NativeString is noautoinit

	# Index (in bytes) of the current item
	var curr_pos: Int

	init do target_items = target._items

	redef fun is_ok do return curr_pos >= 0

	redef fun next do curr_pos -= 1

	redef fun item do return target_items[curr_pos]

	redef fun index do return curr_pos

end
1124
# View of a `FlatBuffer` as a sequence of bytes
private class FlatBufferByteView
	super BufferByteView

	redef type SELFTYPE: FlatBuffer

	# Direct read in `_items`
	redef fun [](index)
	do
		return target._items[index]
	end

	redef fun iterator_from(pos)
	do
		return new FlatBufferByteIterator(target, pos)
	end

	redef fun reverse_iterator_from(pos)
	do
		return new FlatBufferByteReverseIterator(target, pos)
	end

end
1137
# Iterator over the bytes of a `FlatBuffer`, from `curr_pos` up to the last byte
private class FlatBufferByteIterator
	super IndexedIterator[Byte]

	# Buffer iterated on
	var target: FlatBuffer

	# `_items` of `target`, fetched at construction
	var target_items: NativeString is noautoinit

	# Index (in bytes) of the current item
	var curr_pos: Int

	init do target_items = target._items

	redef fun is_ok do return curr_pos < target._bytelen

	redef fun next do curr_pos += 1

	redef fun item do return target_items[curr_pos]

	redef fun index do return curr_pos

end
1158
# Iterator over the chars of a `FlatBuffer`, from `curr_pos` down to index 0
private class FlatBufferCharReverseIterator
	super IndexedIterator[Char]

	# Buffer iterated on
	var target: FlatBuffer

	# Index (in chars) of the current item
	var curr_pos: Int

	redef fun is_ok do return curr_pos >= 0

	redef fun next do curr_pos -= 1

	redef fun item do return target[curr_pos]

	redef fun index do return curr_pos

end
1175
# View of a `FlatBuffer` as a sequence of chars
private class FlatBufferCharView
	super BufferCharView

	redef type SELFTYPE: FlatBuffer

	redef fun [](index) do return target[index]

	redef fun []=(index, item)
	do
		assert index >= 0 and index <= length
		# Writing right after the last char is an append
		if index == length then
			add(item)
		else
			target[index] = item
		end
	end

	redef fun add(c)
	do
		target.add(c)
	end

	redef fun push(c)
	do
		target.add(c)
	end

	# Forward to `enlarge` of the buffer
	fun enlarge(cap: Int)
	do
		target.enlarge(cap)
	end

	redef fun append(s)
	do
		var s_length = s.length
		if target.capacity < s_length then enlarge(s_length + target._length)
		for c in s do target.add c
	end

	redef fun iterator_from(pos) do return new FlatBufferCharIterator(target, pos)

	redef fun reverse_iterator_from(pos) do return new FlatBufferCharReverseIterator(target, pos)

end
1220
# Iterator over the chars of a `FlatBuffer`, from `curr_pos` up to the last char
private class FlatBufferCharIterator
	super IndexedIterator[Char]

	# Buffer iterated on
	var target: FlatBuffer

	# Index of the last char of `target`, computed at construction
	var max: Int is noautoinit

	# Index (in chars) of the current item
	var curr_pos: Int

	init do max = target._length - 1

	redef fun is_ok do return curr_pos <= max

	redef fun next do curr_pos += 1

	redef fun item do return target[curr_pos]

	redef fun index do return curr_pos

end
1241
1242 redef class NativeString
redef fun to_s do return to_s_with_length(cstring_length)

redef fun to_s_with_length(length)
do
	assert length >= 0
	# The bytes go through `clean_utf8` before being used as a string
	return clean_utf8(length)
end

redef fun to_s_full(bytelen, unilen)
do
	return new FlatString.full(self, bytelen, 0, unilen)
end

redef fun to_s_unsafe(len)
do
	# Without an explicit length, use `cstring_length`
	if len == null then len = cstring_length
	return new FlatString.with_infos(self, len, 0)
end

redef fun to_s_with_copy
do
	return to_s_with_copy_and_length(cstring_length)
end
1264
# Get a `String` from `length` bytes at `self` copied into Nit memory
fun to_s_with_copy_and_length(length: Int): String
do
	var cleaned = clean_utf8(length)
	# `clean_utf8` returned a string that does not use `self`: nothing to copy
	if cleaned.items != self then return cleaned
	var copy = new NativeString(length + 1)
	copy[length] = 0u8
	copy_to(copy, length, 0, 0)
	var res = new FlatString.with_infos(copy, length, 0)
	# The copy is terminated by a 0 byte: it is also the `to_cstring` of the result
	res.to_cstring = copy
	return res
end
1277
1278 # Cleans a NativeString if necessary
1279 fun clean_utf8(len: Int): FlatString do
1280 var replacements: nullable Array[Int] = null
1281 var end_length = len
1282 var pos = 0
1283 var chr_ln = 0
1284 var rem = len
1285 while rem > 0 do
1286 while rem >= 4 do
1287 var i = fetch_4_chars(pos)
1288 if i & 0x80808080 != 0 then break
1289 pos += 4
1290 chr_ln += 4
1291 rem -= 4
1292 end
1293 if rem == 0 then break
1294 var b = self[pos]
1295 if b & 0x80u8 == 0x00u8 then
1296 pos += 1
1297 chr_ln += 1
1298 rem -= 1
1299 continue
1300 end
1301 var nxst = length_of_char_at(pos)
1302 var ok_st: Bool
1303 if nxst == 1 then
1304 ok_st = b & 0x80u8 == 0u8
1305 else if nxst == 2 then
1306 ok_st = b & 0xE0u8 == 0xC0u8
1307 else if nxst == 3 then
1308 ok_st = b & 0xF0u8 == 0xE0u8
1309 else
1310 ok_st = b & 0xF8u8 == 0xF0u8
1311 end
1312 if not ok_st then
1313 if replacements == null then replacements = new Array[Int]
1314 replacements.add pos
1315 end_length += 2
1316 pos += 1
1317 rem -= 1
1318 chr_ln += 1
1319 continue
1320 end
1321 var ok_c: Bool
1322 var c = char_at(pos)
1323 var cp = c.code_point
1324 if nxst == 1 then
1325 ok_c = cp >= 0 and cp <= 0x7F
1326 else if nxst == 2 then
1327 ok_c = cp >= 0x80 and cp <= 0x7FF
1328 else if nxst == 3 then
1329 ok_c = cp >= 0x800 and cp <= 0xFFFF
1330 ok_c = ok_c and not (cp >= 0xD800 and cp <= 0xDFFF) and cp != 0xFFFE and cp != 0xFFFF
1331 else
1332 ok_c = cp >= 0x10000 and cp <= 0x10FFFF
1333 end
1334 if not ok_c then
1335 if replacements == null then replacements = new Array[Int]
1336 replacements.add pos
1337 end_length += 2
1338 pos += 1
1339 chr_ln += 1
1340 rem -= 1
1341 continue
1342 end
1343 var clen = c.u8char_len
1344 pos += clen
1345 rem -= clen
1346 chr_ln += 1
1347 end
1348 var ret = self
1349 if end_length != len then
1350 ret = new NativeString(end_length)
1351 var old_repl = 0
1352 var off = 0
1353 var repls = replacements.as(not null)
1354 var r = repls.items.as(not null)
1355 var imax = repls.length
1356 for i in [0 .. imax[ do
1357 var repl_pos = r[i]
1358 var chkln = repl_pos - old_repl
1359 copy_to(ret, chkln, old_repl, off)
1360 off += chkln
1361 ret[off] = 0xEFu8
1362 ret[off + 1] = 0xBFu8
1363 ret[off + 2] = 0xBDu8
1364 old_repl = repl_pos + 1
1365 off += 3
1366 end
1367 copy_to(ret, len - old_repl, old_repl, off)
1368 end
1369 return new FlatString.full(ret, end_length, 0, chr_ln)
1370 end
1371
1372 # Sets the next bytes at position `pos` to the value of `c`, encoded in UTF-8
1373 #
1374 # Very unsafe, make sure to have room for this char prior to calling this function.
1375 private fun set_char_at(pos: Int, c: Char) do
1376 var cp = c.code_point
1377 if cp < 128 then
1378 self[pos] = cp.to_b
1379 return
1380 end
1381 var ln = c.u8char_len
1382 if ln == 2 then
1383 self[pos] = (0xC0 | ((cp & 0x7C0) >> 6)).to_b
1384 self[pos + 1] = (0x80 | (cp & 0x3F)).to_b
1385 else if ln == 3 then
1386 self[pos] = (0xE0 | ((cp & 0xF000) >> 12)).to_b
1387 self[pos + 1] = (0x80 | ((cp & 0xFC0) >> 6)).to_b
1388 self[pos + 2] = (0x80 | (cp & 0x3F)).to_b
1389 else if ln == 4 then
1390 self[pos] = (0xF0 | ((cp & 0x1C0000) >> 18)).to_b
1391 self[pos + 1] = (0x80 | ((cp & 0x3F000) >> 12)).to_b
1392 self[pos + 2] = (0x80 | ((cp & 0xFC0) >> 6)).to_b
1393 self[pos + 3] = (0x80 | (cp & 0x3F)).to_b
1394 end
1395 end
1396 end
1397
redef class Int
	# Signed, base 10, textual representation of `self`
	#
	#     assert 1.to_s == "1"
	#     assert (-123).to_s == "-123"
	redef fun to_s
	do
		# Shortcut for the two most common values
		if self == 0 then return "0"
		if self == 1 then return "1"

		# `int_to_s_len` gives the size used for the result;
		# one extra byte is reserved for the final 0
		var digits = int_to_s_len
		var size = digits + 1
		var buf = new NativeString(size)
		buf[digits] = 0u8
		native_int_to_s(buf, size)
		return new FlatString.full(buf, digits, 0, digits)
	end
end
1415
redef class Array[E]

	# Concatenation of the `to_s` of each non-null element
	#
	# The result is assembled in one `NativeString` sized from the sum of the
	# `bytelen` of the parts.
	redef fun plain_to_s
	do
		var count = _length
		if count == 0 then return ""
		var elems = _items.as(not null)
		var head = elems[0]
		if count == 1 then
			if head == null then return ""
			return head.to_s
		end

		# First pass: stringify the non-null elements and sum their byte lengths
		var parts = new NativeArray[String](count)
		var kept = 0
		var total = 0
		var idx = 0
		while idx < count do
			var elem = elems[idx]
			idx += 1
			if elem != null then
				var part = elem.to_s
				total += part.bytelen
				parts[kept] = part
				kept += 1
			end
		end

		# Second pass: copy the bytes of each part into the destination
		var dest = new NativeString(total + 1)
		dest[total] = 0u8
		var cursor = 0
		idx = 0
		while idx < kept do
			var part = parts[idx]
			idx += 1
			if part isa FlatString then
				var flat_len = part._bytelen
				part._items.copy_to(dest, flat_len, part._first_byte, cursor)
				cursor += flat_len
			else
				# Other strings are copied one flat substring at a time
				for sub in part.substrings do
					var flat = sub.as(FlatString)
					var sub_len = flat._bytelen
					flat._items.copy_to(dest, sub_len, flat._first_byte, cursor)
					cursor += sub_len
				end
			end
		end
		return new FlatString.with_infos(dest, total, 0)
	end
end
1465
redef class NativeArray[E]
	# Concatenation of all the strings of `self`
	#
	# `self` must be a `NativeArray[String]`.
	redef fun native_to_s
	do
		assert self isa NativeArray[String]
		var strs = self
		var count = length

		# First pass: sum the byte lengths to size the destination
		var total = 0
		var idx = 0
		while idx < count do
			total += strs[idx].bytelen
			idx += 1
		end

		# Second pass: copy the bytes of each string into the destination
		var dest = new NativeString(total + 1)
		dest[total] = 0u8
		var cursor = 0
		idx = 0
		while idx < count do
			var part = strs[idx]
			idx += 1
			if part isa FlatString then
				var flat_len = part._bytelen
				part._items.copy_to(dest, flat_len, part._first_byte, cursor)
				cursor += flat_len
			else
				# Other strings are copied one flat substring at a time
				for sub in part.substrings do
					var flat = sub.as(FlatString)
					var sub_len = flat._bytelen
					flat._items.copy_to(dest, sub_len, flat._first_byte, cursor)
					cursor += sub_len
				end
			end
		end
		return new FlatString.with_infos(dest, total, 0)
	end
end
1502
redef class Map[K,V]
	# Concatenation of all the entries of `self`
	#
	# Each entry is written as key, `couple_sep`, then value, and two
	# consecutive entries are separated by `sep`.
	# A null key or value is written as `<null>`.
	redef fun join(sep, couple_sep)
	do
		if is_empty then return ""

		var res = new Buffer
		var it = iterator
		var is_first = true
		while it.is_ok do
			# The separator goes between entries, hence not before the first one
			if is_first then
				is_first = false
			else
				res.append(sep)
			end
			var key = it.key
			var value = it.item
			res.append("{key or else "<null>"}{couple_sep}{value or else "<null>"}")
			it.next
		end
		return res.to_s
	end
end