a6f955ab06b752f0221a48f761188a23d4c2c1a7
[nit.git] / lib / core / text / flat.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # All the array-based text representations
12 module flat
13
14 intrude import abstract_text
15 intrude import native
16
17 `{
18 #include <stdio.h>
19 #include <string.h>
20 `}
21
# Iterator over the substrings of a flat text
#
# It yields `tgt` once, then is exhausted.
private class FlatSubstringsIter
	super Iterator[FlatText]

	# Text still to be yielded, `null` once `next` has been called
	var tgt: nullable FlatText

	redef fun is_ok
	do
		return tgt != null
	end

	redef fun item
	do
		var current = tgt
		assert current != null
		return current
	end

	redef fun next
	do
		tgt = null
	end
end
36
redef class FlatText

	# First byte of the NativeString
	protected fun first_byte: Int do return 0

	# Last byte of the NativeString
	protected fun last_byte: Int do return first_byte + _byte_length - 1

	# Cache of the latest position (char) explored in the string
	var position: Int = 0

	# Cached position (bytes) in the NativeString underlying the String
	var bytepos: Int = 0

	# Index of the character `index` in `_items`
	#
	# The cached pair (`_position`, `_bytepos`) is moved to the requested
	# character each time the lookup leaves the cached one.
	fun char_to_byte_index(index: Int): Int do
		var dpos = index - _position
		var b = _bytepos
		var its = _items

		# Shortcut: character right after the cached one
		if dpos == 1 then
			if its[b] & 0x80u8 == 0x00u8 then
				# High bit clear: the cached character is one byte long
				b += 1
			else
				b += its.length_of_char_at(b)
			end
			_bytepos = b
			_position = index
			return b
		end
		# Shortcut: character right before the cached one
		if dpos == -1 then
			b = its.find_beginning_of_char_at(b - 1)
			_bytepos = b
			_position = index
			return b
		end
		# Shortcut: the cached character itself
		if dpos == 0 then return b

		var ln = _length
		var pos = _position
		# Find best insertion point: start of the text, end of the text
		# or cached position, whichever is the closest to `index`
		var delta_begin = index
		var delta_end = (ln - 1) - index
		var delta_cache = (pos - index).abs
		var min = delta_begin

		if delta_cache < min then min = delta_cache
		if delta_end < min then min = delta_end

		var ns_i: Int
		var my_i: Int

		if min == delta_cache then
			ns_i = _bytepos
			my_i = pos
		else if min == delta_begin then
			ns_i = first_byte
			my_i = 0
		else
			ns_i = its.find_beginning_of_char_at(last_byte)
			my_i = _length - 1
		end

		ns_i = its.char_to_byte_index_cached(index, my_i, ns_i)

		_position = index
		_bytepos = ns_i

		return ns_i
	end

	# By escaping `self` to HTML, how many more bytes will be needed ?
	fun chars_to_html_escape: Int do
		var its = _items
		var max = last_byte
		var pos = first_byte
		var endlen = 0
		while pos <= max do
			var c = its[pos]
			if c == 0x3Cu8 or c == 0x3Eu8 then
				# `<` and `>` become 4-byte entities: 3 extra bytes
				endlen += 3
			else if c == 0x26u8 or c == 0x22u8 or c == 0x27u8 or c == 0x2Fu8 then
				# `&`, `"`, `'` and `/` become 5-byte entities: 4 extra bytes
				endlen += 4
			end
			pos += 1
		end
		return endlen
	end

	redef fun html_escape
	do
		var extra = chars_to_html_escape
		# Nothing to escape: no new storage is needed
		if extra == 0 then return to_s
		var its = _items
		var max = last_byte
		var pos = first_byte
		var nlen = extra + _byte_length
		var nits = new NativeString(nlen)
		var outpos = 0
		while pos <= max do
			var c = its[pos]
			# Special codes:
			# Some HTML characters are used as meta-data, they need
			# to be replaced by an HTML-Escaped equivalent
			#
			# * 0x3C (<) => &lt;
			# * 0x3E (>) => &gt;
			# * 0x26 (&) => &amp;
			# * 0x22 (") => &#34;
			# * 0x27 (') => &#39;
			# * 0x2F (/) => &#47;
			if c == 0x3Cu8 then
				nits[outpos] = 0x26u8
				nits[outpos + 1] = 0x6Cu8
				nits[outpos + 2] = 0x74u8
				nits[outpos + 3] = 0x3Bu8
				outpos += 4
			else if c == 0x3Eu8 then
				nits[outpos] = 0x26u8
				nits[outpos + 1] = 0x67u8
				nits[outpos + 2] = 0x74u8
				nits[outpos + 3] = 0x3Bu8
				outpos += 4
			else if c == 0x26u8 then
				nits[outpos] = 0x26u8
				nits[outpos + 1] = 0x61u8
				nits[outpos + 2] = 0x6Du8
				nits[outpos + 3] = 0x70u8
				nits[outpos + 4] = 0x3Bu8
				outpos += 5
			else if c == 0x22u8 then
				nits[outpos] = 0x26u8
				nits[outpos + 1] = 0x23u8
				nits[outpos + 2] = 0x33u8
				nits[outpos + 3] = 0x34u8
				nits[outpos + 4] = 0x3Bu8
				outpos += 5
			else if c == 0x27u8 then
				nits[outpos] = 0x26u8
				nits[outpos + 1] = 0x23u8
				nits[outpos + 2] = 0x33u8
				nits[outpos + 3] = 0x39u8
				nits[outpos + 4] = 0x3Bu8
				outpos += 5
			else if c == 0x2Fu8 then
				nits[outpos] = 0x26u8
				nits[outpos + 1] = 0x23u8
				nits[outpos + 2] = 0x34u8
				nits[outpos + 3] = 0x37u8
				nits[outpos + 4] = 0x3Bu8
				outpos += 5
			else
				nits[outpos] = c
				outpos += 1
			end
			pos += 1
		end
		var s = new FlatString.with_infos(nits, nlen, 0)
		return s
	end

	# Must the `?` at byte `pos` of `_items` be escaped as `\?` in C?
	#
	# True when the next byte still belongs to `self` and is one of
	# `!`, `(`, `)`, `-`, `/`, `<`, `=` or `>`.
	#
	# We ignore `??'` because it will be escaped as `??\'`.
	#
	# Shared by `chars_to_escape_to_c` and `escape_to_c` so that the
	# computed size and the written bytes always agree.
	private fun question_mark_needs_escape(pos: Int): Bool do
		var j = pos + 1
		# `j` is a byte index in `_items`: it is bounded by `last_byte`,
		# not by the number of characters
		if j > last_byte then return false
		var next = _items[j]
		return next == 0x21u8 or
			next == 0x28u8 or
			next == 0x29u8 or
			next == 0x2Du8 or
			next == 0x2Fu8 or
			next == 0x3Cu8 or
			next == 0x3Du8 or
			next == 0x3Eu8
	end

	# By escaping `self` to C, how many more bytes will be needed ?
	#
	# This enables a double-optimization in `escape_to_c` since if this
	# method returns 0, then `self` does not need escaping and can be
	# returned as-is
	fun chars_to_escape_to_c: Int do
		var its = _items
		var max = last_byte
		var pos = first_byte
		var req_esc = 0
		while pos <= max do
			var c = its[pos]
			if c == 0x0Au8 or c == 0x09u8 or c == 0x22u8 or c == 0x27u8 or c == 0x5Cu8 then
				# Two-byte escape sequence: 1 extra byte
				req_esc += 1
			else if c == 0x3Fu8 then
				if question_mark_needs_escape(pos) then req_esc += 1
			else if c < 32u8 then
				# Four-byte octal sequence: 3 extra bytes
				req_esc += 3
			end
			pos += 1
		end
		return req_esc
	end

	redef fun escape_to_c do
		var ln_extra = chars_to_escape_to_c
		if ln_extra == 0 then return self.to_s
		var its = _items
		var max = last_byte
		var nlen = _byte_length + ln_extra
		var nns = new NativeString(nlen)
		var pos = first_byte
		var opos = 0
		while pos <= max do
			var c = its[pos]
			# Special codes:
			#
			# Any byte with value < 32 is a control character
			# All their uses will be replaced by their octal
			# value in C.
			#
			# There are two exceptions however:
			#
			# * 0x09 => \t
			# * 0x0A => \n
			#
			# Aside from the code points above, the following are:
			#
			# * 0x22 => \"
			# * 0x27 => \'
			# * 0x5C => \\
			if c == 0x09u8 then
				nns[opos] = 0x5Cu8
				nns[opos + 1] = 0x74u8
				opos += 2
			else if c == 0x0Au8 then
				nns[opos] = 0x5Cu8
				nns[opos + 1] = 0x6Eu8
				opos += 2
			else if c == 0x22u8 then
				nns[opos] = 0x5Cu8
				nns[opos + 1] = 0x22u8
				opos += 2
			else if c == 0x27u8 then
				nns[opos] = 0x5Cu8
				nns[opos + 1] = 0x27u8
				opos += 2
			else if c == 0x5Cu8 then
				nns[opos] = 0x5Cu8
				nns[opos + 1] = 0x5Cu8
				opos += 2
			else if c == 0x3Fu8 then
				# A backslash is only inserted when the next byte would
				# otherwise complete a trigraph
				if question_mark_needs_escape(pos) then
					nns[opos] = 0x5Cu8
					opos += 1
				end
				nns[opos] = 0x3Fu8
				opos += 1
			else if c < 32u8 then
				# `\0` followed by the two low octal digits of `c`
				nns[opos] = 0x5Cu8
				nns[opos + 1] = 0x30u8
				nns[opos + 2] = ((c & 0x38u8) >> 3) + 0x30u8
				nns[opos + 3] = (c & 0x07u8) + 0x30u8
				opos += 4
			else
				nns[opos] = c
				opos += 1
			end
			pos += 1
		end
		return nns.to_s_unsafe(nlen)
	end

	redef fun [](index) do
		var len = _length

		# Statistically:
		# * ~70% want the next char
		# * ~23% want the previous
		# * ~7% want the same char
		#
		# So it makes sense to shortcut early. And early is here.
		var dpos = index - _position
		var b = _bytepos
		if dpos == 1 and index < len - 1 then
			var its = _items
			var c = its[b]
			if c & 0x80u8 == 0x00u8 then
				# We want the next, and current is easy.
				# So next is easy to find!
				b += 1
				_position = index
				_bytepos = b
				# The rest will be done by `dpos==0` below.
				dpos = 0
			end
		else if dpos == -1 and index > 1 then
			var its = _items
			var c = its[b-1]
			if c & 0x80u8 == 0x00u8 then
				# We want the previous, and it is easy.
				b -= 1
				dpos = 0
				_position = index
				_bytepos = b
				return c.ascii
			end
		end
		if dpos == 0 then
			# We know what we want (+0 or +1) just get it now!
			var its = _items
			var c = its[b]
			if c & 0x80u8 == 0x00u8 then return c.ascii
			return items.char_at(b)
		end

		# No shortcut applied: check the bounds and do the full lookup
		assert index >= 0 and index < len
		return fetch_char_at(index)
	end

	# Gets a `Char` at `index` in `self`
	#
	# WARNING: Use at your own risks as no bound-checking is done
	fun fetch_char_at(index: Int): Char do
		var i = char_to_byte_index(index)
		var items = _items
		var b = items[i]
		# High bit clear: single-byte character, no decoding needed
		if b & 0x80u8 == 0x00u8 then return b.ascii
		return items.char_at(i)
	end

	# If `self` contains only digits and alpha <= 'f', return the corresponding integer.
	#
	#     assert "ff".to_hex == 255
	#
	# `pos` defaults to 0 and `ln` to the rest of the text.
	# Each byte of the range is read as one hexadecimal digit.
	redef fun to_hex(pos, ln) do
		var res = 0
		if pos == null then pos = 0
		if ln == null then ln = length - pos
		pos = char_to_byte_index(pos)
		var its = _items
		var max = pos + ln
		for i in [pos .. max[ do
			res <<= 4
			res += its[i].ascii.from_hex
		end
		return res
	end

	redef fun copy_to_native(dst, n, src_off, dst_off) do
		# `src_off` is relative to the text, `_items` is indexed from `first_byte`
		_items.copy_to(dst, n, first_byte + src_off, dst_off)
	end
end
414
# Immutable strings of characters.
abstract class FlatString
	super FlatText
	super String

	# Index at which `self` begins in `_items`, inclusively
	redef var first_byte is noinit

	redef var chars = new FlatStringCharView(self) is lazy

	redef var bytes = new FlatStringByteView(self) is lazy

	redef fun to_cstring do
		var blen = _byte_length
		# One extra byte for the terminating 0
		var new_items = new NativeString(blen + 1)
		_items.copy_to(new_items, blen, _first_byte, 0)
		new_items[blen] = 0u8
		return new_items
	end

	redef fun reversed do
		var b = new FlatBuffer.with_capacity(_byte_length + 1)
		var i = _length - 1
		while i >= 0 do
			b.add self.fetch_char_at(i)
			i -= 1
		end
		var s = b.to_s.as(FlatString)
		s._length = self._length
		return s
	end

	redef fun fast_cstring do return _items.fast_cstring(_first_byte)

	redef fun substring(from, count)
	do
		if count <= 0 then return ""

		# A negative start is clamped to 0 and shortens the request
		if from < 0 then
			count += from
			if count <= 0 then return ""
			from = 0
		end

		var ln = _length
		if (count + from) > ln then count = ln - from
		if count <= 0 then return ""
		var end_index = from + count - 1
		return substring_impl(from, count, end_index)
	end

	# Builds the substring of `count` chars going from char `from` to char `end_index`
	#
	# The arguments are expected to be already clamped by `substring`.
	# The returned string shares `_items` with `self`.
	private fun substring_impl(from, count, end_index: Int): String do
		var cache = _position
		# Distance of each bound to the cached position:
		# the closest bound is resolved first
		var dfrom = (cache - from).abs
		var dend = (cache - end_index).abs

		var bytefrom: Int
		var byteto: Int
		if dfrom < dend then
			bytefrom = char_to_byte_index(from)
			byteto = char_to_byte_index(end_index)
		else
			byteto = char_to_byte_index(end_index)
			bytefrom = char_to_byte_index(from)
		end

		var its = _items
		# `byteto` is the first byte of the last char, move it to its last byte
		byteto += its.length_of_char_at(byteto) - 1

		var s = new FlatString.full(its, byteto - bytefrom + 1, bytefrom, count)
		return s
	end

	redef fun empty do return "".as(FlatString)

	redef fun to_upper
	do
		var outstr = new FlatBuffer.with_capacity(self._byte_length + 1)

		var mylen = _length
		var pos = 0

		while pos < mylen do
			outstr.add(chars[pos].to_upper)
			pos += 1
		end

		return outstr.to_s
	end

	redef fun to_lower
	do
		var outstr = new FlatBuffer.with_capacity(self._byte_length + 1)

		var mylen = _length
		var pos = 0

		while pos < mylen do
			outstr.add(chars[pos].to_lower)
			pos += 1
		end

		return outstr.to_s
	end

	redef fun output
	do
		for i in chars do i.output
	end

	##################################################
	#              String Specific Methods           #
	##################################################

	# Low-level creation of a new string with minimal data.
	#
	# `_items` will be used as is, without copy, to retrieve the characters of the string.
	# Aliasing issues is the responsibility of the caller.
	#
	# The number of characters is computed from the bytes.
	private new with_infos(items: NativeString, byte_length, from: Int)
	do
		var len = items.utf8_length(from, byte_length)
		# As many chars as bytes: every char is one byte long
		if byte_length == len then return new ASCIIFlatString.full_data(items, byte_length, from, len)
		return new UnicodeFlatString.full_data(items, byte_length, from, len)
	end

	# Low-level creation of a new string with all the data.
	#
	# `_items` will be used as is, without copy, to retrieve the characters of the string.
	# Aliasing issues is the responsibility of the caller.
	private new full(items: NativeString, byte_length, from, length: Int)
	do
		if byte_length == length then return new ASCIIFlatString.full_data(items, byte_length, from, length)
		return new UnicodeFlatString.full_data(items, byte_length, from, length)
	end

	redef fun ==(other)
	do
		if not other isa FlatText then return super

		if self.object_id == other.object_id then return true

		var my_length = _byte_length

		if other._byte_length != my_length then return false

		var my_index = _first_byte
		var its_index = other.first_byte

		var last_iteration = my_index + my_length

		var its_items = other._items
		var my_items = self._items

		# Same byte length: compare byte per byte
		while my_index < last_iteration do
			if my_items[my_index] != its_items[its_index] then return false
			my_index += 1
			its_index += 1
		end

		return true
	end

	redef fun <(other)
	do
		if not other isa FlatText then return super

		if self.object_id == other.object_id then return false

		var myits = _items
		var itsits = other._items

		var mbt = _byte_length
		var obt = other.byte_length

		var minln = if mbt < obt then mbt else obt
		var mst = _first_byte
		var ost = other.first_byte

		# The first differing byte decides
		for i in [0 .. minln[ do
			var my_curr_char = myits[mst]
			var its_curr_char = itsits[ost]

			if my_curr_char > its_curr_char then return false
			if my_curr_char < its_curr_char then return true

			mst += 1
			ost += 1
		end

		# Common prefix: the shortest one is the smallest
		return mbt < obt
	end

	redef fun +(o) do
		var s = o.to_s
		var slen = s.byte_length
		var mlen = _byte_length
		var nlen = mlen + slen
		var mits = _items
		var mifrom = _first_byte
		if s isa FlatText then
			var sits = s._items
			var sifrom = s.first_byte
			var ns = new NativeString(nlen + 1)
			mits.copy_to(ns, mlen, mifrom, 0)
			sits.copy_to(ns, slen, sifrom, mlen)
			return new FlatString.full(ns, nlen, 0, _length + o.length)
		else
			abort
		end
	end

	redef fun *(i) do
		# No repetition requested: a negative `i` would otherwise give a
		# negative size and a write before the start of the new storage
		if i <= 0 then return empty
		var mybtlen = _byte_length
		var new_byte_length = mybtlen * i
		var mylen = _length
		var newlen = mylen * i
		var its = _items
		var fb = _first_byte
		var ns = new NativeString(new_byte_length + 1)
		ns[new_byte_length] = 0u8
		var offset = 0
		while i > 0 do
			its.copy_to(ns, mybtlen, fb, offset)
			offset += mybtlen
			i -= 1
		end
		return new FlatString.full(ns, new_byte_length, 0, newlen)
	end

	redef fun hash
	do
		# Computed once over the bytes of `self`, then kept in `hash_cache`
		if hash_cache == null then
			# djb2 hash algorithm
			var h = 5381
			var i = _first_byte

			var my_items = _items
			var max = last_byte

			while i <= max do
				h = (h << 5) + h + my_items[i].to_i
				i += 1
			end

			hash_cache = h
		end

		return hash_cache.as(not null)
	end

	redef fun substrings do return new FlatSubstringsIter(self)
end
667
# Regular Nit UTF-8 strings
private class UnicodeFlatString
	super FlatString

	# Creates a string over `byte_length` bytes of `items` starting at byte `from`
	#
	# `length` is the number of characters; it is stored as given.
	init full_data(items: NativeString, byte_length, from, length: Int)
	do
		_items = items
		_byte_length = byte_length
		_length = length
		_first_byte = from
		# The position cache starts on the first character
		_bytepos = from
	end

	redef fun substring_from(from)
	do
		var total = _length
		if from >= total then return empty
		if from <= 0 then return self
		var start_byte = char_to_byte_index(from)
		# Bytes left once the skipped prefix is removed
		var remaining_bytes = byte_length - (start_byte - _first_byte)
		return new FlatString.full(items, remaining_bytes, start_byte, total - from)
	end
end
689
# Special cases of String where all the characters are ASCII-based
#
# Optimizes access operations to O(1) complexity.
private class ASCIIFlatString
	super FlatString

	# Creates a string over `byte_length` bytes of `items` starting at byte `from`
	#
	# `length` is the number of characters; it is stored as given.
	init full_data(items: NativeString, byte_length, from, length: Int)
	do
		_items = items
		_byte_length = byte_length
		_length = length
		_first_byte = from
		_bytepos = from
	end

	redef fun [](idx)
	do
		assert idx >= 0 and idx < _byte_length
		var byte = _items[_first_byte + idx]
		return byte.ascii
	end

	redef fun substring(from, count)
	do
		if count <= 0 then return ""
		# Cut what goes past the end of the string
		var overflow = from + count - _length
		if overflow > 0 then
			count -= overflow
			if count <= 0 then return ""
		end
		# A negative start is clamped to 0 and shortens the request
		if from < 0 then
			count += from
			if count <= 0 then return ""
			from = 0
		end
		var start = _first_byte + from
		return new ASCIIFlatString.full_data(_items, count, start, count)
	end

	redef fun reversed
	do
		var buf = new FlatBuffer.with_capacity(_byte_length + 1)
		var remaining = _length
		while remaining > 0 do
			remaining -= 1
			buf.add self[remaining]
		end
		return buf.to_s.as(FlatString)
	end

	# One byte per character: the byte index is the char index shifted by `_first_byte`
	redef fun char_to_byte_index(index)
	do
		return _first_byte + index
	end

	redef fun substring_impl(from, count, end_index)
	do
		var start = _first_byte + from
		return new ASCIIFlatString.full_data(_items, count, start, count)
	end

	redef fun fetch_char_at(i)
	do
		var byte = _items[_first_byte + i]
		return byte.ascii
	end
end
741
# Backward iterator over the characters of a `FlatString`
private class FlatStringCharReverseIterator
	super IndexedIterator[Char]

	# String whose characters are visited
	var target: FlatString

	# Index of the current character, decremented by `next`
	var curr_pos: Int

	redef fun index
	do
		return curr_pos
	end

	redef fun is_ok
	do
		return curr_pos >= 0
	end

	redef fun item
	do
		return target[curr_pos]
	end

	redef fun next
	do
		curr_pos -= 1
	end
end
758
# Forward iterator over the characters of a `FlatString`
private class FlatStringCharIterator
	super IndexedIterator[Char]

	# String whose characters are visited
	var target: FlatString

	# Index of the last character of `target`, computed at construction
	var max: Int is noautoinit

	# Index of the current character, incremented by `next`
	var curr_pos: Int

	init
	do
		max = target._length - 1
	end

	redef fun index
	do
		return curr_pos
	end

	redef fun is_ok
	do
		return curr_pos <= max
	end

	redef fun item
	do
		return target[curr_pos]
	end

	redef fun next
	do
		curr_pos += 1
	end
end
779
# View of a `FlatString` as a sequence of characters
private class FlatStringCharView
	super StringCharView

	redef type SELFTYPE: FlatString

	redef fun [](index)
	do
		return target[index]
	end

	redef fun iterator_from(start)
	do
		var str = target
		return new FlatStringCharIterator(str, start)
	end

	redef fun reverse_iterator_from(start)
	do
		var str = target
		return new FlatStringCharReverseIterator(str, start)
	end
end
792
# Backward iterator over the bytes of a `FlatString`
private class FlatStringByteReverseIterator
	super IndexedIterator[Byte]

	# String whose bytes are visited
	var target: FlatString

	# Storage of `target`, read once at construction
	var target_items: NativeString is noautoinit

	# Current position
	#
	# Given relative to the string, then kept as an index in `target_items`.
	var curr_pos: Int

	init
	do
		target_items = target._items
		curr_pos = curr_pos + target._first_byte
	end

	redef fun index
	do
		return curr_pos - target._first_byte
	end

	redef fun is_ok
	do
		return curr_pos >= target._first_byte
	end

	redef fun item
	do
		return target_items[curr_pos]
	end

	redef fun next
	do
		curr_pos -= 1
	end
end
818
# Forward iterator over the bytes of a `FlatString`
private class FlatStringByteIterator
	super IndexedIterator[Byte]

	# String whose bytes are visited
	var target: FlatString

	# Storage of `target`, read once at construction
	var target_items: NativeString is noautoinit

	# Current position
	#
	# Given relative to the string, then kept as an index in `target_items`.
	var curr_pos: Int

	init
	do
		target_items = target._items
		curr_pos = curr_pos + target._first_byte
	end

	redef fun index
	do
		return curr_pos - target._first_byte
	end

	redef fun is_ok
	do
		return curr_pos <= target.last_byte
	end

	redef fun item
	do
		return target_items[curr_pos]
	end

	redef fun next
	do
		curr_pos += 1
	end
end
844
# View of a `FlatString` as a sequence of bytes
private class FlatStringByteView
	super StringByteView

	redef type SELFTYPE: FlatString

	redef fun [](index)
	do
		var str = _target
		# `index` is relative to the string and must designate one of its bytes
		assert 0 <= index and index < str._byte_length
		return str._items[str._first_byte + index]
	end

	redef fun iterator_from(start)
	do
		var str = target
		return new FlatStringByteIterator(str, start)
	end

	redef fun reverse_iterator_from(start)
	do
		var str = target
		return new FlatStringByteReverseIterator(str, start)
	end
end
865
redef class Buffer
	# A new `Buffer` is a `FlatBuffer`
	redef new
	do
		return new FlatBuffer
	end

	# A new `Buffer` with a capacity is a `FlatBuffer` built by `with_capacity(i)`
	redef new with_cap(i)
	do
		return new FlatBuffer.with_capacity(i)
	end
end
871
# Mutable strings of characters.
class FlatBuffer
	super FlatText
	super Buffer

	redef var chars: Sequence[Char] = new FlatBufferCharView(self) is lazy

	redef var bytes = new FlatBufferByteView(self) is lazy

	private var char_cache: Int = -1

	private var byte_cache: Int = -1

	private var capacity = 0

	redef fun fast_cstring do return _items.fast_cstring(0)

	redef fun substrings do return new FlatSubstringsIter(self)

	# Re-copies the `NativeString` into a new one and sets it as the new `Buffer`
	#
	# This happens when an operation modifies the current `Buffer` and
	# the Copy-On-Write flag `written` is set at true.
	private fun reset do
		var nns = new NativeString(capacity)
		if _byte_length != 0 then _items.copy_to(nns, _byte_length, 0, 0)
		_items = nns
		written = false
	end

	# Shifts the content of the buffer by `len` bytes to the right, starting at byte `from`
	#
	# When the shifted content does not fit in `capacity`, a larger storage
	# is allocated and installed as `_items`: callers must re-read `_items`
	# after the call.
	#
	# Internal only, does not modify _byte_length or length, this is the caller's responsibility
	private fun rshift_bytes(from: Int, len: Int) do
		var oit = _items
		var nit = oit
		var bt = _byte_length
		if bt + len > capacity then
			var ncap = capacity * 2 + 2
			if ncap < bt + len then ncap = bt + len
			capacity = ncap
			nit = new NativeString(ncap)
			# Bytes before `from` do not move, but must follow in the new storage
			oit.copy_to(nit, from, 0, 0)
			_items = nit
		end
		oit.copy_to(nit, bt - from, from, from + len)
	end

	# Shifts the content of the buffer by `len` bytes to the left, starting at `from`
	#
	# Internal only, does not modify _byte_length or length, this is the caller's responsibility
	private fun lshift_bytes(from: Int, len: Int) do
		var it = _items
		it.copy_to(it, _byte_length - from, from, from - len)
	end

	redef fun []=(index, item)
	do
		assert index >= 0 and index <= _length
		if written then reset
		is_dirty = true
		# Writing just after the last char is an `add`
		if index == _length then
			add item
			return
		end
		var it = _items
		var ip = it.char_to_byte_index(index)
		var c = it.char_at(ip)
		var clen = c.u8char_len
		var itemlen = item.u8char_len
		# The replaced and the new char may not have the same byte size
		var size_diff = itemlen - clen
		if size_diff > 0 then
			rshift_bytes(ip + clen, size_diff)
		else if size_diff < 0 then
			lshift_bytes(ip + clen, -size_diff)
		end
		_byte_length += size_diff
		# The chars after `index` moved by `size_diff` bytes: keep the
		# cached byte position in sync with the cached char position
		if index < _position then _bytepos += size_diff
		# `rshift_bytes` may have replaced the storage: write in the current one
		_items.set_char_at(ip, item)
	end

	redef fun add(c)
	do
		if written then reset
		is_dirty = true
		var clen = c.u8char_len
		var bt = _byte_length
		enlarge(bt + clen)
		_items.set_char_at(bt, c)
		_byte_length += clen
		_length += 1
	end

	redef fun clear do
		is_dirty = true
		_byte_length = 0
		_length = 0
		# The cached position may designate a char that no longer exists
		_position = 0
		_bytepos = 0
		if written then
			_capacity = 16
			reset
		end
	end

	redef fun empty do return new Buffer

	redef fun enlarge(cap)
	do
		var c = capacity
		if cap <= c then return
		if c <= 16 then c = 16
		while c <= cap do c = c * 2
		# The COW flag can be set at false here, since
		# it does a copy of the current `Buffer`
		written = false
		var bln = _byte_length
		var a = new NativeString(c)
		if bln > 0 then _items.copy_to(a, bln, 0, 0)
		_items = a
		capacity = c
	end

	redef fun to_s
	do
		# The storage is now shared with the returned string
		written = true
		var bln = _byte_length
		if bln == 0 then _items = new NativeString(1)
		return new FlatString.full(_items, bln, 0, _length)
	end

	redef fun to_cstring
	do
		var bln = _byte_length
		# One extra byte for the terminating 0
		var new_native = new NativeString(bln + 1)
		new_native[bln] = 0u8
		if _length > 0 then _items.copy_to(new_native, bln, 0, 0)
		return new_native
	end

	# Create a new empty string.
	init do end

	# Low-level creation a new buffer with given data.
	#
	# `_items` will be used as is, without copy, to store the characters of the buffer.
	# Aliasing issues is the responsibility of the caller.
	#
	# If `_items` is shared, `written` should be set to true after the creation
	# so that a modification will do a copy-on-write.
	private init with_infos(items: NativeString, capacity, byte_length, length: Int)
	do
		self._items = items
		self.capacity = capacity
		self._byte_length = byte_length
		self._length = length
	end

	# Create a new string copied from `s`.
	init from(s: Text)
	do
		_items = new NativeString(s.byte_length)
		# Each substring is copied from its own first byte, one after the other
		var offset = 0
		for i in s.substrings do
			var ln = i._byte_length
			i._items.copy_to(_items, ln, i.first_byte, offset)
			offset += ln
		end
		_byte_length = s.byte_length
		_length = s.length
		_capacity = _byte_length
	end

	# Create a new empty string with a given capacity.
	init with_capacity(cap: Int)
	do
		assert cap >= 0
		_items = new NativeString(cap)
		capacity = cap
		_byte_length = 0
	end

	redef fun append(s)
	do
		if s.is_empty then return
		is_dirty = true
		var sl = s.byte_length
		var nln = _byte_length + sl
		enlarge(nln)
		if s isa FlatText then
			s._items.copy_to(_items, sl, s.first_byte, _byte_length)
		else
			# Not flat: append its substrings one by one,
			# each call updates the lengths itself
			for i in s.substrings do append i
			return
		end
		_byte_length = nln
		_length += s.length
	end

	# Copies the content of self in `dest`
	#
	# `len` chars are read from char `start` of `self` and written
	# in `dest` from char `new_start`.
	fun copy(start: Int, len: Int, dest: Buffer, new_start: Int)
	do
		var self_chars = self.chars
		var dest_chars = dest.chars
		for i in [0..len-1] do
			dest_chars[new_start+i] = self_chars[start+i]
		end
	end

	redef fun substring(from, count)
	do
		assert count >= 0
		if from < 0 then from = 0
		if (from + count) > _length then count = _length - from
		if count <= 0 then return new Buffer
		var its = _items
		var bytefrom = its.char_to_byte_index(from)
		var byteto = its.char_to_byte_index(count + from - 1)
		# `byteto` is the first byte of the last char, move it to its last byte
		byteto += its.char_at(byteto).u8char_len - 1
		var byte_length = byteto - bytefrom + 1
		var r_items = new NativeString(byte_length)
		its.copy_to(r_items, byte_length, bytefrom, 0)
		return new FlatBuffer.with_infos(r_items, byte_length, byte_length, count)
	end

	redef fun append_substring_impl(s, from, length) do
		if length <= 0 then return
		if not s isa FlatText then
			super
			return
		end
		is_dirty = true
		var sits = s._items
		var bytest = s.char_to_byte_index(from)
		var bytend = s.char_to_byte_index(from + length - 1)
		# `bytend` is the first byte of the last char: count all of its bytes
		var btln = bytend - bytest + sits.length_of_char_at(bytend)
		enlarge(btln + _byte_length)
		sits.copy_to(_items, btln, bytest, _byte_length)
		_byte_length += btln
		_length += length
	end

	redef fun reverse
	do
		written = false
		is_dirty = true
		var ns = new FlatBuffer.with_capacity(capacity)
		for i in chars.reverse_iterator do ns.add i
		_items = ns._items
		# The bytes were reordered: restart the position cache
		_position = 0
		_bytepos = 0
	end

	redef fun times(repeats)
	do
		var bln = _byte_length
		# View on the current content, appended `repeats - 1` times
		var x = new FlatString.full(_items, bln, 0, _length)
		for i in [1 .. repeats[ do
			append(x)
		end
	end

	redef fun upper
	do
		if written then reset
		for i in [0 .. _length[ do self[i] = self[i].to_upper
	end

	redef fun lower
	do
		if written then reset
		for i in [0 .. _length[ do self[i] = self[i].to_lower
	end
end
1133
# Backward iterator over the bytes of a `FlatBuffer`
private class FlatBufferByteReverseIterator
	super IndexedIterator[Byte]

	# Buffer whose bytes are visited
	var target: FlatBuffer

	# Storage of `target`, read once at construction
	var target_items: NativeString is noautoinit

	# Index of the current byte, decremented by `next`
	var curr_pos: Int

	init
	do
		target_items = target._items
	end

	redef fun is_ok
	do
		return curr_pos >= 0
	end

	redef fun item
	do
		return target_items[curr_pos]
	end

	redef fun index
	do
		return curr_pos
	end

	redef fun next
	do
		curr_pos -= 1
	end
end
1154
# View of a `FlatBuffer` as a sequence of bytes
private class FlatBufferByteView
	super BufferByteView

	redef type SELFTYPE: FlatBuffer

	# Byte at `index` in the storage of the buffer, no bound is checked here
	redef fun [](index)
	do
		var storage = target._items
		return storage[index]
	end

	redef fun iterator_from(pos)
	do
		var buf = target
		return new FlatBufferByteIterator(buf, pos)
	end

	redef fun reverse_iterator_from(pos)
	do
		var buf = target
		return new FlatBufferByteReverseIterator(buf, pos)
	end
end
1167
# Forward iterator over the bytes of a `FlatBuffer`
private class FlatBufferByteIterator
	super IndexedIterator[Byte]

	# Buffer whose bytes are visited
	var target: FlatBuffer

	# Storage of `target`, read once at construction
	var target_items: NativeString is noautoinit

	# Index of the current byte, incremented by `next`
	var curr_pos: Int

	init
	do
		target_items = target._items
	end

	redef fun is_ok
	do
		return curr_pos < target._byte_length
	end

	redef fun item
	do
		return target_items[curr_pos]
	end

	redef fun index
	do
		return curr_pos
	end

	redef fun next
	do
		curr_pos += 1
	end
end
1188
# Backward iterator over the characters of a `FlatBuffer`
private class FlatBufferCharReverseIterator
	super IndexedIterator[Char]

	# Buffer whose characters are visited
	var target: FlatBuffer

	# Index of the current character, decremented by `next`
	var curr_pos: Int

	redef fun is_ok
	do
		return curr_pos >= 0
	end

	redef fun item
	do
		return target[curr_pos]
	end

	redef fun index
	do
		return curr_pos
	end

	redef fun next
	do
		curr_pos -= 1
	end
end
1205
# View of a `FlatBuffer` as a mutable sequence of characters
private class FlatBufferCharView
	super BufferCharView

	redef type SELFTYPE: FlatBuffer

	redef fun [](index)
	do
		return target[index]
	end

	redef fun []=(index, item)
	do
		assert index >= 0 and index <= length
		# Writing just after the last char is an `add`
		if index != length then
			target[index] = item
			return
		end
		add(item)
	end

	redef fun push(c)
	do
		var buf = target
		buf.add(c)
	end

	redef fun add(c)
	do
		var buf = target
		buf.add(c)
	end

	# Asks the buffer for a capacity of at least `cap`
	fun enlarge(cap: Int)
	do
		var buf = target
		buf.enlarge(cap)
	end

	redef fun append(s)
	do
		var s_length = s.length
		var buf = target
		if buf.capacity < s_length then enlarge(s_length + buf._length)
		for c in s do buf.add c
	end

	redef fun iterator_from(pos)
	do
		return new FlatBufferCharIterator(target, pos)
	end

	redef fun reverse_iterator_from(pos)
	do
		return new FlatBufferCharReverseIterator(target, pos)
	end
end
1250
# Forward iterator over the characters of a `FlatBuffer`
private class FlatBufferCharIterator
	super IndexedIterator[Char]

	# Buffer whose characters are visited
	var target: FlatBuffer

	# Index of the last character of `target`, computed at construction
	var max: Int is noautoinit

	# Index of the current character, incremented by `next`
	var curr_pos: Int

	init
	do
		max = target._length - 1
	end

	redef fun is_ok
	do
		return curr_pos <= max
	end

	redef fun item
	do
		return target[curr_pos]
	end

	redef fun index
	do
		return curr_pos
	end

	redef fun next
	do
		curr_pos += 1
	end
end
1271
1272 redef class NativeString
# The length used is `cstring_length`
redef fun to_s
do
	var len = cstring_length
	return to_s_with_length(len)
end
1277
# The string is built by `clean_utf8` over the first `length` bytes
redef fun to_s_with_length(length)
do
	assert length >= 0
	var cleaned = clean_utf8(length)
	return cleaned
end
1283
# `self` is used as is, from byte 0, with the given byte and char counts
redef fun to_s_full(byte_length, unilen)
do
	var str = new FlatString.full(self, byte_length, 0, unilen)
	return str
end
1287
# `self` is used as is; when `len` is null, `cstring_length` is used
redef fun to_s_unsafe(len)
do
	var byte_count = len
	if byte_count == null then byte_count = cstring_length
	return new FlatString.with_infos(self, byte_count, 0)
end
1292
1293 redef fun to_s_with_copy do return to_s_with_copy_and_length(cstring_length)
1294
1295 # Get a `String` from `length` bytes at `self` copied into Nit memory
1296 fun to_s_with_copy_and_length(length: Int): String
1297 do
1298 var r = clean_utf8(length)
1299 if r.items != self then return r
1300 var new_self = new NativeString(length + 1)
1301 copy_to(new_self, length, 0, 0)
1302 var str = new FlatString.with_infos(new_self, length, 0)
1303 new_self[length] = 0u8
1304 return str
1305 end
1306
1307 # Cleans a NativeString if necessary
1308 fun clean_utf8(len: Int): FlatString do
1309 var replacements: nullable Array[Int] = null
1310 var end_length = len
1311 var pos = 0
1312 var chr_ln = 0
1313 var rem = len
1314 while rem > 0 do
1315 while rem >= 4 do
1316 var i = fetch_4_chars(pos)
1317 if i & 0x80808080 != 0 then break
1318 pos += 4
1319 chr_ln += 4
1320 rem -= 4
1321 end
1322 if rem == 0 then break
1323 var b = self[pos]
1324 if b & 0x80u8 == 0x00u8 then
1325 pos += 1
1326 chr_ln += 1
1327 rem -= 1
1328 continue
1329 end
1330 var nxst = length_of_char_at(pos)
1331 var ok_st: Bool
1332 if nxst == 1 then
1333 ok_st = b & 0x80u8 == 0u8
1334 else if nxst == 2 then
1335 ok_st = b & 0xE0u8 == 0xC0u8
1336 else if nxst == 3 then
1337 ok_st = b & 0xF0u8 == 0xE0u8
1338 else
1339 ok_st = b & 0xF8u8 == 0xF0u8
1340 end
1341 if not ok_st then
1342 if replacements == null then replacements = new Array[Int]
1343 replacements.add pos
1344 end_length += 2
1345 pos += 1
1346 rem -= 1
1347 chr_ln += 1
1348 continue
1349 end
1350 var ok_c: Bool
1351 var c = char_at(pos)
1352 var cp = c.code_point
1353 if nxst == 1 then
1354 ok_c = cp >= 0 and cp <= 0x7F
1355 else if nxst == 2 then
1356 ok_c = cp >= 0x80 and cp <= 0x7FF
1357 else if nxst == 3 then
1358 ok_c = cp >= 0x800 and cp <= 0xFFFF
1359 ok_c = ok_c and not (cp >= 0xD800 and cp <= 0xDFFF) and cp != 0xFFFE and cp != 0xFFFF
1360 else
1361 ok_c = cp >= 0x10000 and cp <= 0x10FFFF
1362 end
1363 if not ok_c then
1364 if replacements == null then replacements = new Array[Int]
1365 replacements.add pos
1366 end_length += 2
1367 pos += 1
1368 chr_ln += 1
1369 rem -= 1
1370 continue
1371 end
1372 var clen = c.u8char_len
1373 pos += clen
1374 rem -= clen
1375 chr_ln += 1
1376 end
1377 var ret = self
1378 if end_length != len then
1379 ret = new NativeString(end_length)
1380 var old_repl = 0
1381 var off = 0
1382 var repls = replacements.as(not null)
1383 var r = repls.items.as(not null)
1384 var imax = repls.length
1385 for i in [0 .. imax[ do
1386 var repl_pos = r[i]
1387 var chkln = repl_pos - old_repl
1388 copy_to(ret, chkln, old_repl, off)
1389 off += chkln
1390 ret[off] = 0xEFu8
1391 ret[off + 1] = 0xBFu8
1392 ret[off + 2] = 0xBDu8
1393 old_repl = repl_pos + 1
1394 off += 3
1395 end
1396 copy_to(ret, len - old_repl, old_repl, off)
1397 end
1398 return new FlatString.full(ret, end_length, 0, chr_ln)
1399 end
1400
1401 # Sets the next bytes at position `pos` to the value of `c`, encoded in UTF-8
1402 #
1403 # Very unsafe, make sure to have room for this char prior to calling this function.
1404 private fun set_char_at(pos: Int, c: Char) do
1405 var cp = c.code_point
1406 if cp < 128 then
1407 self[pos] = cp.to_b
1408 return
1409 end
1410 var ln = c.u8char_len
1411 if ln == 2 then
1412 self[pos] = (0xC0 | ((cp & 0x7C0) >> 6)).to_b
1413 self[pos + 1] = (0x80 | (cp & 0x3F)).to_b
1414 else if ln == 3 then
1415 self[pos] = (0xE0 | ((cp & 0xF000) >> 12)).to_b
1416 self[pos + 1] = (0x80 | ((cp & 0xFC0) >> 6)).to_b
1417 self[pos + 2] = (0x80 | (cp & 0x3F)).to_b
1418 else if ln == 4 then
1419 self[pos] = (0xF0 | ((cp & 0x1C0000) >> 18)).to_b
1420 self[pos + 1] = (0x80 | ((cp & 0x3F000) >> 12)).to_b
1421 self[pos + 2] = (0x80 | ((cp & 0xFC0) >> 6)).to_b
1422 self[pos + 3] = (0x80 | (cp & 0x3F)).to_b
1423 end
1424 end
1425 end
1426
redef class Int
	# Signed, base 10 textual form of `self`
	#
	#     assert 1.to_s == "1"
	#     assert (-123).to_s == "-123"
	redef fun to_s
	do
		# Shortcut for the two most frequent values
		if self == 0 then return "0"
		if self == 1 then return "1"

		var digits = int_to_s_len
		# One extra byte for the trailing NUL
		var buf = new NativeString(digits + 1)
		buf[digits] = 0u8
		native_int_to_s(buf, digits + 1)
		return new FlatString.full(buf, digits, 0, digits)
	end
end
1444
redef class Array[E]

	# Fast implementation
	#
	# The `to_s` of every non-null element are concatenated, in order,
	# into a single freshly allocated `FlatString`. Null elements are skipped.
	redef fun plain_to_s
	do
		var count = _length
		if count == 0 then return ""
		var elems = _items.as(not null)
		var head = elems[0]
		if count == 1 then
			if head == null then return ""
			return head.to_s
		end

		# First pass: stringify the elements and sum up their byte lengths
		var parts = new NativeArray[String](count)
		var total = 0
		var kept = 0
		for idx in [0 .. count[ do
			var elem = elems[idx]
			if elem == null then continue
			var txt = elem.to_s
			total += txt.byte_length
			parts[kept] = txt
			kept += 1
		end

		# Second pass: copy the bytes of each part into a single buffer
		# (one extra byte is allocated for the trailing NUL)
		var out = new NativeString(total + 1)
		out[total] = 0u8
		var dst = 0
		for idx in [0 .. kept[ do
			var part = parts[idx]
			if part isa FlatString then
				var n = part._byte_length
				part._items.copy_to(out, n, part._first_byte, dst)
				dst += n
			else
				# Not flat: copy each of its flat substrings in turn
				for sub in part.substrings do
					var flat = sub.as(FlatString)
					var n = flat._byte_length
					flat._items.copy_to(out, n, flat._first_byte, dst)
					dst += n
				end
			end
		end
		return new FlatString.with_infos(out, total, 0)
	end
end
1494
redef class NativeArray[E]
	# Concatenate all the strings of `self` into a single `FlatString`
	#
	# `self` must be a `NativeArray[String]`.
	redef fun native_to_s
	do
		assert self isa NativeArray[String]
		var count = length
		var strs = self

		# First pass: total byte length of the result
		var total = 0
		for idx in [0 .. count[ do
			total += strs[idx].byte_length
		end

		# Second pass: copy the bytes of each string into a single buffer
		# (one extra byte is allocated for the trailing NUL)
		var out = new NativeString(total + 1)
		out[total] = 0u8
		var dst = 0
		for idx in [0 .. count[ do
			var part = strs[idx]
			if part isa FlatString then
				var n = part._byte_length
				part._items.copy_to(out, n, part._first_byte, dst)
				dst += n
			else
				# Not flat: copy each of its flat substrings in turn
				for sub in part.substrings do
					var flat = sub.as(FlatString)
					var n = flat._byte_length
					flat._items.copy_to(out, n, flat._first_byte, dst)
					dst += n
				end
			end
		end
		return new FlatString.with_infos(out, total, 0)
	end
end
1531
redef class Map[K,V]
	# Concatenate the couples of `self` into a single string
	#
	# Key and value are separated by `couple_sep`, couples are separated by `sep`.
	# A null key or a null value is rendered as `<null>`.
	# An empty map gives an empty string.
	redef fun join(sep, couple_sep)
	do
		if is_empty then return ""

		var out = new Buffer
		var it = iterator
		loop
			var key = it.key
			var value = it.item
			out.append("{key or else "<null>"}{couple_sep}{value or else "<null>"}")
			it.next
			if not it.is_ok then break
			# Another couple follows: separate it from the previous one
			out.append(sep)
		end
		return out.to_s
	end
end