lib: remove class StringCapable
[nit.git] / lib / string_experimentations / utf8_noindex.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Introduces UTF-8 as internal encoding for Strings in Nit.
16 module utf8_noindex
17
18 intrude import standard::string
19 intrude import standard::file
20
in "C Header" `{

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <stdint.h>
	/* ntohl / htonl, used to normalise the byte order of a UnicodeChar */
	#include <arpa/inet.h>

	/*
	 * Non-zero when the host is big-endian: the two bytes 0x00 0xFF read as a
	 * uint16_t give 0x00FF (< 0x100) on big-endian hosts and 0xFF00 otherwise.
	 */
	#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)

`}
30
# UTF-8 char as defined in RFC-3629, e.g. 1-4 Bytes
#
# The character is stored in a heap-allocated `uint32_t` whose numeric value
# is the UTF-8 byte sequence of the character read as a big-endian number
# (the bytes are right-aligned, unused leading bytes are 0).
extern class UnicodeChar `{ uint32_t* `}
	super Comparable

	redef type OTHER: UnicodeChar

	# Transforms a byte-variable char* character to its uint32_t equivalent
	#
	# `index` is the byte offset in `ns` of the first byte of the character.
	# A byte that is not a valid UTF-8 lead byte is taken as a 1-byte char.
	new from_ns(ns: NativeString, index: Int) `{
		unsigned char* ret = calloc(1,4);
		unsigned char lead = (unsigned char)ns[index];
		/* Right-align the sequence in the 4-byte buffer */
		if((lead & 0x80) == 0){ memcpy(ret + 3, ns + index, 1); }
		else if((lead & 0xE0) == 0xC0) { memcpy(ret + 2, ns + index, 2); }
		else if((lead & 0xF0) == 0xE0) { memcpy(ret + 1, ns + index, 3); }
		/* 11110xxx: the mask must keep the five high bits (0xF8, not 0xF7) */
		else if((lead & 0xF8) == 0xF0) { memcpy(ret, ns + index, 4); }
		else{ memcpy(ret + 3, ns + index, 1);}
		if (!IS_BIG_ENDIAN) {
			/* The buffer holds the bytes in big-endian order, convert to host order */
			uint32_t tmp = ntohl(*((uint32_t*)ret));
			memcpy(ret, &tmp, 4);
		}
		return (uint32_t*)ret;
	`}

	# Real length of the char in UTF8
	#
	# As per the specification :
	#
	# Length | UTF-8 octet sequence
	#        | (binary)
	# -------+-------------------------------------------------
	# 1      | 0xxxxxxx
	# 2      | 110xxxxx 10xxxxxx
	# 3      | 1110xxxx 10xxxxxx 10xxxxxx
	# 4      | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	#
	# A value that fits none of these ranges is reported as 1 byte long.
	fun len: Int `{
		uint32_t s = *recv;
		if(s <= 0x7Fu) {return 1;}
		if(s >= 0xC080u && s <= 0xDFBFu) {return 2;}
		if(s >= 0xE08080u && s <= 0xEFBFBFu) {return 3;}
		if(s >= 0xF0808080u && s <= 0xF7BFBFBFu) { return 4; }
		// Bad character
		return 1;
	`}

	# Returns the Unicode code point representing the character
	#
	# Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence
	fun code_point: Int import UnicodeChar.len `{
		uint32_t val = *recv;
		uint32_t ret = 0;
		/* Drop the UTF-8 marker bits of every byte and pack the payload bits */
		switch(UnicodeChar_len(recv)){
			case 1:
				ret = val;
				break;
			case 2:
				ret = ((val & 0x00001F00) >> 2) | (val & 0x0000003F);
				break;
			case 3:
				ret = ((val & 0x000F0000) >> 4) | ((val & 0x00003F00) >> 2) | (val & 0x0000003F);
				break;
			case 4:
				ret = ((val & 0x07000000) >> 6) | ((val & 0x003F0000) >> 4) | ((val & 0x00003F00) >> 2) | (val & 0x0000003F);
				break;
		}
		return ret;
	`}

	# Warning : This does not follow the Unicode specification for now
	#
	# TODO: Support Unicode-compliant comparison
	redef fun <(o) do return self.code_point < o.code_point

	# Returns an upper-case version of self
	#
	# NOTE : Works only on ASCII chars, any other char is returned as is
	# TODO : Support unicode for to_upper
	fun to_upper: UnicodeChar import UnicodeChar.code_point `{
		/* Only 'a'..'z' have an upper-case form here */
		if(*recv < 97 || *recv > 122){ return recv; }
		uint32_t* ret = calloc(1,4);
		*ret = *recv - 32;
		return ret;
	`}

	# Returns a lower-case version of self
	#
	# NOTE : Works only on ASCII chars, any other char is returned as is
	# TODO : Support unicode for to_lower
	fun to_lower: UnicodeChar import UnicodeChar.code_point `{
		/* Only 'A'..'Z' have a lower-case form here */
		if(*recv < 65 || *recv > 90){ return recv; }
		uint32_t* ret = calloc(1,4);
		*ret = *recv + 32;
		return ret;
	`}

	# Two chars are equal if they have the same code point
	redef fun ==(o)
	do
		if not o isa UnicodeChar then return false
		return o.code_point == self.code_point
	end

	# Writes the UTF-8 bytes of the char on the standard output
	redef fun output import UnicodeChar.len `{
		int len = UnicodeChar_len(recv);
		uint32_t bytes = *recv;
		if(!IS_BIG_ENDIAN){
			/* Back to big-endian so that the bytes are in UTF-8 order in memory */
			uint32_t tmp = htonl(bytes);
			memcpy(&bytes, &tmp, 4);
		}
		unsigned char* s = (unsigned char*) &bytes;
		/* The sequence is right-aligned: it occupies the last `len` bytes */
		fwrite(s + (4 - len), 1, len, stdout);
	`}

	# Returns a new 1-char `FlatString` holding the UTF-8 bytes of the char
	redef fun to_s: FlatString import FlatString.full, UnicodeChar.len `{
		int len = UnicodeChar_len(recv);
		char* r = malloc(len + 1);
		r[len] = '\0';
		uint32_t src = *recv;
		if(!IS_BIG_ENDIAN){
			/* Back to big-endian so that the bytes are in UTF-8 order in memory */
			uint32_t tmp = htonl(src);
			memcpy(&src, &tmp, 4);
		}
		unsigned char* s = (unsigned char*) &src;
		/* The sequence is right-aligned: it occupies the last `len` bytes */
		memcpy(r, s + (4 - len), len);
		return new_FlatString_full(r, 0, len - 1, len, 1);
	`}
end
173
# Remembers the last char that was looked up in a String
#
# It pairs a char index with the matching byte offset.
class CharCache

	# Index of the cached char, counted in chars
	var position: Int

	# Offset of the cached char in the NativeString underlying the String, counted in bytes
	var bytepos: Int

end
181
# Iterator over the chars of a `FlatString`, from the last one to the first one
class FlatStringReviter
	super IndexedIterator[UnicodeChar]

	# The NativeString to iterate upon
	private var ns: NativeString

	# The position in the string
	private var pos: Int

	# The position in the native string
	private var bytepos: Int

	# Iterates on `s` starting from its last char
	init(s: FlatString) do from(s, s.length - 1)

	# Iterates on `s` starting from the char at `position`
	#
	# A negative `position` (e.g. the one computed for an empty string)
	# gives an iterator that is already exhausted.
	init from(s: FlatString, position: Int)
	do
		ns = s.items
		pos = position
		if position < 0 then
			# Nothing to iterate on, `byte_index` would reject the position
			bytepos = s.index_from
		else
			bytepos = s.byte_index(position)
		end
	end

	redef fun next
	do
		pos -= 1
		# The first char was the current one: do not read before the start of `ns`
		if pos < 0 then return
		bytepos -= 1
		# Skip the continuation bytes (10xxxxxx) to reach the lead byte of the previous char
		while ns[bytepos].ascii.bin_and(0xC0) == 0x80 do
			bytepos -= 1
		end
	end

	redef fun index do return pos

	redef fun item do return new UnicodeChar.from_ns(ns, bytepos)

	redef fun is_ok do return pos >= 0
end
218
# Iterator over the chars of a `FlatString`, from the first one to the last one
class FlatStringIter
	super IndexedIterator[UnicodeChar]

	# The NativeString to iterate upon
	private var ns: NativeString

	# The position in the string, in chars
	private var pos: Int

	# The position in the native string, in bytes
	private var bytepos: Int

	# Length in chars of the iterated string
	private var slen: Int

	# The current char, only meaningful when `is_created` is true
	private var it: UnicodeChar

	# Has `it` been decoded for the current position ?
	private var is_created: Bool

	# Iterates on `s` starting from its first char
	init(s: FlatString) do from(s, 0)

	# Iterates on `s` starting from the char at `position`
	#
	# A `position` at or after the end of `s` (e.g. 0 on an empty string)
	# gives an iterator that is already exhausted.
	init from(s: FlatString, position: Int) do
		ns = s.items
		pos = position
		slen = s.length
		# No char has been decoded yet
		is_created = false
		if position >= slen then
			# Nothing to iterate on, `byte_index` would reject the position
			bytepos = s.index_to + 1
		else
			bytepos = s.byte_index(position)
		end
	end

	redef fun index do return pos

	redef fun is_ok do return pos < slen

	redef fun item do
		# Decode the char lazily, once per position
		if not is_created then
			it = new UnicodeChar.from_ns(ns, bytepos)
			is_created = true
		end
		return it
	end

	redef fun next
	do
		# The byte length of the current char is needed to skip it
		var pace = item.len
		is_created = false
		pos += 1
		bytepos += pace
	end
end
266
redef class FlatString

	redef type OTHER: FlatString

	# Length in bytes of the string (e.g. the length of the C string)
	redef var bytelen: Int

	# Cache for the last accessed character in the string
	var cache = new CharCache(-1,-1)

	# Length in chars, computed on first access by walking the bytes
	redef var length = length_l is lazy

	# Builds a string on the bytes `from` to `to` (inclusive) of `items`
	#
	# `bytelen` is the number of bytes and `len` the number of chars of the new string.
	private init full(items: NativeString, from, to, bytelen, len: Int)
	do
		self.items = items
		index_from = from
		index_to = to
		self.bytelen = bytelen
		length = len
	end

	# Length implementation
	#
	# Counts the chars between `index_from` and `index_to` by jumping from lead byte to lead byte.
	private fun length_l: Int import FlatString.items, FlatString.index_to, FlatString.index_from `{
		char* ns = FlatString_items(recv);
		int i = FlatString_index_from(recv);
		int max = FlatString_index_to(recv);
		int length = 0;
		while(i <= max){
			char c = ns[i];
			if((c & 0x80) == 0) { i+= 1; }
			else if((c & 0xE0) == 0xC0) { i += 2; }
			else if((c & 0xF0) == 0xE0) { i += 3; }
			/* 11110xxx: the mask must keep the five high bits (0xF8, not 0xF7) */
			else if((c & 0xF8) == 0xF0) { i += 4; }
			/* Invalid lead byte, counted as a 1-byte char */
			else { i += 1; }
			length ++;
		}
		return length;
	`}

	# Char by char comparison, a proper prefix is lower than the longer string
	redef fun <(o)
	do
		var mylen = length
		var olen = o.length
		# Only the chars present in both strings can be compared
		var common = mylen
		if olen < common then common = olen
		for i in [0 .. common[ do
			var mine = char_at(i)
			var its = o.char_at(i)
			if mine > its then return false
			if mine < its then return true
		end
		# All the common chars are equal: the shorter string comes first
		return mylen < olen
	end

	redef fun ==(o) do
		if o == null then return false
		if not o isa FlatString then return super
		var mylen = length
		if mylen != o.length then return false
		var pos = 0
		while pos < mylen do
			if char_at(pos) != o.char_at(pos) then return false
			pos += 1
		end
		return true
	end

	# Byte offset in `items` of the first byte of the char at `index`
	#
	# The walk starts from the closest known position: the beginning of the
	# string, its end, or the last accessed char (see `cache`).
	# The result is stored in `cache`.
	private fun byte_index(index: Int): Int do
		assert index >= 0
		assert index < length

		# Find best insertion point
		var delta_begin = index
		var delta_end = (length - 1) - index
		var delta_cache = (cache.position - index).abs
		var min = delta_begin

		if delta_cache < min then min = delta_cache
		if delta_end < min then min = delta_end

		var ns_i: Int
		var my_i: Int
		var myits = items

		# Invariant of both walks: `ns_i` is the byte where the char `my_i` starts
		# (the char `length` virtually starts right after the last byte)
		if min == delta_begin then
			ns_i = index_from
			my_i = 0
		else if min == delta_cache then
			ns_i = cache.bytepos
			my_i = cache.position
		else
			ns_i = index_to + 1
			my_i = length
		end

		# Forward walk: the lead byte gives the length of the char to skip
		while my_i < index do
			if myits[ns_i].ascii.bin_and(0x80) == 0 then
				ns_i += 1
			else if myits[ns_i].ascii.bin_and(0xE0) == 0xC0 then
				ns_i += 2
			else if myits[ns_i].ascii.bin_and(0xF0) == 0xE0 then
				ns_i += 3
			else if myits[ns_i].ascii.bin_and(0xF8) == 0xF0 then
				ns_i += 4
			else
				ns_i += 1
			end
			my_i += 1
		end

		# Backward walk: step back one byte at a time, a char is passed
		# each time a byte that is not a continuation byte (10xxxxxx) is reached
		while my_i > index do
			ns_i -= 1
			if myits[ns_i].ascii.bin_and(0xC0) != 0x80 then my_i -= 1
		end

		cache.position = index
		cache.bytepos = ns_i

		return ns_i
	end

	# The char at position `pos` (counted in chars)
	fun char_at(pos: Int): UnicodeChar do
		return new UnicodeChar.from_ns(items, byte_index(pos))
	end

	# Builds a string on the bytes `index_from` to `index_to` (inclusive) of `items`
	#
	# The length in chars is not given, it is computed lazily.
	private init with_bytelen(items: NativeString, index_from: Int, index_to: Int, bytelen: Int) do
		self.items = items
		self.index_from = index_from
		self.index_to = index_to
		self.bytelen = bytelen
	end

	redef fun reversed do
		var new_str = new NativeString(bytelen + 1)
		new_str[bytelen] = '\0'
		var s_pos = bytelen
		var my_pos = index_from
		var its = items
		# Copy the chars one by one, filling the new string from its end
		for i in [0..length[ do
			var c = char_at(i).len
			s_pos -= c
			its.copy_to(new_str, c, my_pos, s_pos)
			my_pos += c
		end
		return new FlatString.full(new_str, 0, bytelen - 1, bytelen, length)
	end

	redef fun to_upper do
		var ns = new NativeString(bytelen + 1)
		ns[bytelen] = '\0'
		var offset = 0
		for i in [0 .. length[
		do
			var c = char_at(i)
			# The case conversion never changes the byte length of a char
			var clen = c.len
			c.to_upper.to_s.items.copy_to(ns, clen, 0, offset)
			offset += clen
		end
		return new FlatString.full(ns, 0, bytelen - 1, bytelen, length)
	end

	redef fun to_lower do
		var ns = new NativeString(bytelen + 1)
		ns[bytelen] = '\0'
		var offset = 0
		for i in [0 .. length[
		do
			var c = char_at(i)
			# The case conversion never changes the byte length of a char
			var clen = c.len
			c.to_lower.to_s.items.copy_to(ns, clen, 0, offset)
			offset += clen
		end
		return new FlatString.full(ns, 0, bytelen - 1, bytelen, length)
	end

	redef fun +(o) do
		if o isa Buffer then o = o.to_s
		if o isa FlatString then
			var new_bytelen = bytelen + o.bytelen
			var new_str = new NativeString(new_bytelen + 1)
			new_str[new_bytelen] = '\0'
			var newlen = length + o.length
			items.copy_to(new_str, bytelen, index_from, 0)
			o.items.copy_to(new_str, o.bytelen, o.index_from, bytelen)
			return new FlatString.full(new_str, 0, new_bytelen - 1, new_bytelen, newlen)
		else if o isa Concat then
			return new Concat(self, o)
		else
			# If it goes to this point, that means another String implementation was concerned, therefore you need to support the + operation for this variant
			abort
		end
	end

	redef fun *(i) do
		var mybtlen = bytelen
		var new_bytelen = mybtlen * i
		var newlen = length * i
		var ns = new NativeString(new_bytelen + 1)
		ns[new_bytelen] = '\0'
		var offset = 0
		while i > 0 do
			items.copy_to(ns, mybtlen, index_from, offset)
			offset += mybtlen
			i -= 1
		end
		return new FlatString.full(ns, 0, new_bytelen - 1, new_bytelen, newlen)
	end

	# O(n)
	#
	# `from` and `count` are counted in chars.
	# A negative `from` shortens `count` accordingly, and a range that
	# runs past the end of the string is cut at the last char.
	redef fun substring(from: Int, count: Int) do
		assert count >= 0

		if from < 0 then
			count += from
			if count < 0 then count = 0
			from = 0
		end

		var mylen = length
		# Nothing to extract, `byte_index` would reject a `from` out of the string
		if count == 0 or from >= mylen then return empty

		var real_from = byte_index(from)

		# Index of the last requested char
		var lst = from + count - 1

		# The range runs past the end: keep everything from `from`
		if lst >= mylen then
			return new FlatString.full(items, real_from, index_to, index_to - real_from + 1, mylen - from)
		end

		var real_to = byte_index(lst)
		var last_len = char_at(lst).len

		# `index_to` is the last byte of the last char, not its lead byte
		return new FlatString.full(items, real_from, real_to + last_len - 1, (real_to + last_len) - real_from, count)
	end

	redef fun to_cstring do
		# The NUL-terminated copy is built once, then reused
		if real_items != null then return real_items.as(not null)
		var new_items = new NativeString(bytelen + 1)
		self.items.copy_to(new_items, bytelen, index_from, 0)
		new_items[bytelen] = '\0'
		self.real_items = new_items
		return new_items
	end
end
509
redef class Text
	# Number of bytes used by the text (as opposed to its number of chars)
	fun bytelen: Int is abstract
end
516
redef class FlatBuffer

	# Number of bytes currently used in `items`
	redef var bytelen: Int

	redef init from(s) do
		if s isa Concat then
			with_capacity(50)
			bytelen = 0
			for i in s.substrings do self.append(i)
			# The content is complete, the code below only handles flat texts
			return
		end
		items = new NativeString(s.bytelen)
		if s isa FlatString then
			s.items.copy_to(items, s.bytelen, s.index_from, 0)
		else
			s.as(FlatBuffer).items.copy_to(items, s.as(FlatBuffer).bytelen, 0, 0)
		end
		length = s.length
		bytelen = s.bytelen
		capacity = s.bytelen
	end

	# Replaces the char at `index` by `item`
	#
	# `index == length` appends `item` at the end of the buffer.
	fun char_at=(index: Int, item: UnicodeChar) do
		is_dirty = true
		if index == length then
			add_unicode item
			return
		end
		assert index >= 0 and index < length
		var ip = byte_at(index)
		var old_len = char_at_byte(ip).len
		var size_diff = item.len - old_len
		# Move what follows the replaced char when the byte lengths differ
		if size_diff > 0 then
			# `rshift_bytes` does not grow the buffer, make room first
			if bytelen + size_diff > capacity then enlarge(bytelen + size_diff)
			rshift_bytes(ip + old_len, size_diff)
		else if size_diff < 0 then
			lshift_bytes(ip + old_len, -size_diff)
		end
		var s = item.to_s
		s.items.copy_to(items, s.bytelen, 0, ip)
	end

	# Shifts the content of the buffer by `len` bytes to the right, starting at byte `from`
	#
	# `bytelen` is updated, but the capacity is not checked: the caller must
	# make sure that `bytelen + len` bytes fit in `items`.
	fun rshift_bytes(from: Int, len: Int) import FlatBuffer.bytelen, FlatBuffer.bytelen=, FlatBuffer.items `{
		long bt = FlatBuffer_bytelen(recv);
		char* ns = FlatBuffer_items(recv);
		int off = from + len;
		memmove(ns + off, ns + from, bt - from);
		FlatBuffer_bytelen__assign(recv, bt + len);
	`}

	# Shifts the content of the buffer by `len` bytes to the left, starting at `from`
	#
	# `bytelen` is updated.
	fun lshift_bytes(from: Int, len: Int) import FlatBuffer.bytelen, FlatBuffer.bytelen=, FlatBuffer.items `{
		long bt = FlatBuffer_bytelen(recv);
		char* ns = FlatBuffer_items(recv);
		int off = from - len;
		memmove(ns + off, ns + from, bt - from);
		FlatBuffer_bytelen__assign(recv, bt - len);
	`}

	# Get the Unicode char stored at `index` in `self`
	fun char_at(index: Int): UnicodeChar do return new UnicodeChar.from_ns(items, byte_at(index))

	# Get the Unicode char stored at `index` (bytewise) in `self`
	fun char_at_byte(index: Int): UnicodeChar do return new UnicodeChar.from_ns(items, index)

	# Add equivalent that supports Unicode
	#
	# Appends `c` at the end of the buffer, growing it if needed.
	fun add_unicode(c: UnicodeChar) do
		is_dirty = true
		var s = c.to_s
		var needed = bytelen + s.bytelen
		# `enlarge` expects the total capacity wanted, not the number of added bytes
		if needed > capacity then enlarge(needed)
		s.items.copy_to(items, s.bytelen, 0, bytelen)
		bytelen = needed
		length += 1
	end

	# Gets the byte index (in NativeString) of the char stored at `i`
	fun byte_at(i: Int): Int do
		assert i < length and i >= 0
		var ns_i = 0
		var real_i = 0
		# Jump from lead byte to lead byte, the lead byte gives the length of the char
		while real_i < i do
			if items[ns_i].ascii.bin_and(0x80) == 0 then
				ns_i += 1
			else if items[ns_i].ascii.bin_and(0xE0) == 0xC0 then
				ns_i += 2
			else if items[ns_i].ascii.bin_and(0xF0) == 0xE0 then
				ns_i += 3
			else if items[ns_i].ascii.bin_and(0xF8) == 0xF0 then
				ns_i += 4
			else
				ns_i += 1
			end
			real_i += 1
		end
		return ns_i
	end

	redef fun enlarge(cap) do
		var c = capacity
		if cap <= c then return
		while c <= cap do c = c * 2 + 2
		var a = new NativeString(c+1)
		if bytelen > 0 then items.copy_to(a, bytelen, 0, 0)
		items = a
		capacity = c
	end

	redef fun append(s) do
		if s isa Concat then
			for i in s.substrings do append i
			# Every part has been appended, `s` itself is not a flat string
			return
		end
		var i = s.as(FlatString)
		var blen = bytelen
		var iblen = i.bytelen
		var newlen = blen + iblen
		if newlen > capacity then
			enlarge(newlen)
		end
		i.items.copy_to(items, iblen, i.index_from, blen)
		bytelen += iblen
		length += i.length
	end

	redef fun reverse
	do
		var btlen = bytelen
		var nns = new NativeString(btlen)
		var ns = items
		var myp = 0
		var itsp = btlen
		# Copy the chars one by one, filling the new bytes from their end
		while myp < btlen do
			var c = char_at_byte(myp).len
			itsp -= c
			ns.copy_to(nns, c, myp, itsp)
			myp += c
		end
		items = nns
		# The new storage is exactly `btlen` bytes long
		capacity = btlen
	end

	redef fun clear do
		length = 0
		bytelen = 0
	end

	# Copies `l` chars of `self`, starting at the char `s`, over the content of `d` at the char `ns`
	#
	# NOTE(review): the bytes are written over the ones of `d` without
	# updating its `bytelen` nor its `length`; this presumably assumes that
	# the overwritten chars use the same number of bytes - confirm against callers.
	redef fun copy(s, l, d, ns) do
		if not d isa FlatBuffer then
			# This implementation here is only concerned by the FlatBuffer
			# If you implement a new Buffer subclass, make sure to support this operation via refinement.
			abort
		end
		# Nothing to copy
		if l <= 0 then return
		var rs = byte_at(s)
		var last = byte_at(s + l - 1)
		# The range ends after the last byte of the last char
		var rl = last + char_at_byte(last).len - rs
		var rns = d.byte_at(ns)
		# `copy_to` takes the offset in the source first, then the offset in the destination
		items.copy_to(d.items, rl, rs, rns)
	end

	redef fun times(i) do
		var len = bytelen
		var off = len
		var newlen = len * i
		if newlen > capacity then enlarge(newlen)
		# The first copy is the current content, `i - 1` more are appended
		for j in [1 .. i[ do
			items.copy_to(items, len, 0, off)
			off += len
		end
		bytelen = newlen
		length = length * i
	end

	redef fun upper do
		var pos = 0
		var btlen = bytelen
		# Single pass over the bytes, jumping from char to char
		while pos < btlen do
			var c = char_at_byte(pos)
			var d = c.to_upper
			# Only ASCII chars are converted, hence the 1-byte copy
			if c != d then d.to_s.items.copy_to(items, 1, 0, pos)
			pos += c.len
		end
	end

	redef fun lower do
		var pos = 0
		var btlen = bytelen
		# Single pass over the bytes, jumping from char to char
		while pos < btlen do
			var c = char_at_byte(pos)
			var d = c.to_lower
			# Only ASCII chars are converted, hence the 1-byte copy
			if c != d then d.to_s.items.copy_to(items, 1, 0, pos)
			pos += c.len
		end
	end

	redef fun to_cstring do
		# One more byte for the terminating NUL expected by C functions
		var ns = new NativeString(bytelen + 1)
		items.copy_to(ns, bytelen, 0, 0)
		ns[bytelen] = '\0'
		return ns
	end
end
709
redef class NativeString

	# Wraps `self` (without copy) in a `FlatString`, up to its terminating NUL
	redef fun to_s: FlatString
	do
		return to_s_with_length(cstring_length)
	end

	# Wraps the first `len` bytes of `self` (without copy) in a `FlatString`
	redef fun to_s_with_length(len: Int): FlatString
	do
		return new FlatString.with_bytelen(self, 0, len - 1, len)
	end

	# Copies `self`, up to its terminating NUL, in a new `FlatString`
	redef fun to_s_with_copy
	do
		var length = cstring_length
		var new_self = new NativeString(length + 1)
		copy_to(new_self, length, 0, 0)
		# The extra byte holds the terminating NUL of the copy
		new_self[length] = '\0'
		return new FlatString.with_bytelen(new_self, 0, length - 1, length)
	end
end
731
redef class OFStream
	# Writes the bytes of `s` in the stream
	redef fun write(s)
	do
		assert is_writable
		if s isa FlatText then
			write_native(s.to_cstring, s.bytelen)
		else
			# The size is given in bytes: a text may use more bytes than it has chars
			for i in s.substrings do write_native(i.to_cstring, i.bytelen)
		end
	end
end