547cbf7345d50a803eec4024add61733b51dd13d
[nit.git] / lib / string_experimentations / utf8_noindex.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Introduces UTF-8 as internal encoding for Strings in Nit.
16 module utf8_noindex
17
18 intrude import standard::string
19 intrude import standard::file
20
in "C Header" `{

/* Standard headers needed by the C bodies of this module. */
#include <stdio.h>     /* printf */
#include <stdlib.h>    /* calloc, malloc */
#include <string.h>    /* memcpy, memmove */
#include <stdint.h>    /* uint16_t, uint32_t */
#include <arpa/inet.h> /* ntohl, htonl */

/* True when the host stores the most significant byte first:
 * the two bytes 0x00 0xFF read as a uint16_t give 0x00FF on such a host
 * and 0xFF00 on a little-endian one. */
#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)

`}
30
# UTF-8 char as defined in RFC-3629, i.e. 1 to 4 bytes
#
# The bytes of the sequence are kept right-aligned in a heap-allocated
# `uint32_t`: the last byte of the sequence is the least significant byte
# of the stored value, unused high bytes are zero.
extern class UnicodeChar `{ uint32_t* `}
	super Comparable

	redef type OTHER: UnicodeChar

	# Builds the char whose UTF-8 sequence starts at byte `index` of `ns`
	#
	# The width of the sequence is deduced from its leading byte.
	# A leading byte that matches no valid pattern is taken as a 1-byte char.
	new from_ns(ns: NativeString, index: Int) `{
		unsigned char* ret = calloc(1,4);
		/* Copy the sequence right-aligned, in the order it appears in `ns` */
		if((ns[index] & 0x80) == 0){ memcpy(ret + 3, ns + index, 1); }
		else if((ns[index] & 0xE0) == 0xC0) { memcpy(ret + 2, ns + index, 2); }
		else if((ns[index] & 0xF0) == 0xE0) { memcpy(ret + 1, ns + index, 3); }
		/* 11110xxx: the mask must keep the five high bits, hence 0xF8 */
		else if((ns[index] & 0xF8) == 0xF0) { memcpy(ret, ns + index, 4); }
		else{ memcpy(ret + 3, ns + index, 1);}
		/* The buffer is laid out most significant byte first,
		 * turn it into a host-order integer */
		if (!IS_BIG_ENDIAN) {
			uint32_t tmp = ntohl(*((uint32_t*)ret));
			memcpy(ret, &tmp, 4);
		}
		return (uint32_t*)ret;
	`}

	# Real length of the char in UTF8
	#
	# As per the specification :
	#
	# ~~~raw
	#  Length  | UTF-8 octet sequence
	#          | (binary)
	# ---------+-------------------------------------------------
	#  1       | 0xxxxxxx
	#  2       | 110xxxxx 10xxxxxx
	#  3       | 1110xxxx 10xxxxxx 10xxxxxx
	#  4       | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	# ~~~
	#
	# A value that fits none of the ranges is reported as 1 byte long.
	fun len: Int `{
		uint32_t s = *recv;
		/* 0x00 .. 0x7F */
		if(s <= 127) {return 1;}
		/* 0xC080 .. 0xDFBF */
		if(s >= 49280 && s <= 57279) {return 2;}
		/* 0xE08080 .. 0xEFBFBF */
		if(s >= 14712960 && s <= 15712191) {return 3;}
		/* 0xF0808080 .. 0xF7BFBFBF */
		if(s >= 4034953344 && s <= 4156538815) { return 4; }
		// Bad character
		return 1;
	`}

	# Returns the Unicode code point representing the character
	#
	# Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence
	fun code_point: Int import UnicodeChar.len `{
		uint32_t val = *recv;
		uint32_t ret = 0;
		/* Drop the marker bits of every byte and pack the payload bits together */
		switch(UnicodeChar_len(recv)){
			case 1:
				ret = *recv;
				break;
			case 2:
				ret = 0 | ((val & 0x00001F00) >> 2) | (val & 0x0000003F);
				break;
			case 3:
				ret = 0 | ((val & 0x000F0000) >> 4) | ((val & 0x00003F00) >> 2) | (val & 0x0000003F);
				break;
			case 4:
				ret = 0 | ((val & 0x07000000) >> 6) | ((val & 0x003F0000) >> 4) | ((val & 0x00003F00) >> 2) | (val & 0x0000003F);
				break;
		}
		return ret;
	`}

	# Compares the code points of `self` and `o`
	#
	# Warning : This does not follow the Unicode specification for now
	#
	# TODO: Support Unicode-compliant comparison
	redef fun <(o) do return self.code_point < o.code_point

	# Returns an upper-case version of self
	#
	# `self` is returned as is when it is not an ASCII lower-case letter,
	# a newly allocated char is returned otherwise.
	#
	# NOTE : Works only on ASCII chars
	# TODO : Support unicode for to_upper
	fun to_upper: UnicodeChar import UnicodeChar.code_point `{
		/* 97 .. 122 is 'a' .. 'z' */
		if(*recv < 97 || *recv > 122){ return recv; }
		uint32_t* ret = calloc(1,4);
		*ret = *recv - 32;
		return ret;
	`}

	# Returns a lower-case version of self
	#
	# `self` is returned as is when it is not an ASCII upper-case letter,
	# a newly allocated char is returned otherwise.
	#
	# NOTE : Works only on ASCII chars
	# TODO : Support unicode for to_lower
	fun to_lower: UnicodeChar import UnicodeChar.code_point `{
		/* 65 .. 90 is 'A' .. 'Z' */
		if(*recv < 65 || *recv > 90){ return recv; }
		uint32_t* ret = calloc(1,4);
		*ret = *recv + 32;
		return ret;
	`}

	# Two chars are equal when they have the same code point
	redef fun ==(o)
	do
		if not o isa UnicodeChar then return false
		return o.code_point == self.code_point
	end

	# Prints the UTF-8 bytes of the char on the standard output
	redef fun output import UnicodeChar.len `{
		uint32_t self = *recv;
		/* Back to the most-significant-byte-first layout to address single bytes */
		if(!IS_BIG_ENDIAN){
			uint32_t tmp = ntohl(self);
			memcpy(&self, &tmp, 4);
		}
		unsigned char* s = (unsigned char*) &self;
		switch(UnicodeChar_len(recv)){
			case 1:
				printf("%c", s[3]);
				break;
			case 2:
				printf("%c%c", s[2], s[3]);
				break;
			case 3:
				printf("%c%c%c", s[1], s[2], s[3]);
				break;
			case 4:
				printf("%c%c%c%c", s[0], s[1], s[2], s[3]);
				break;
		}
	`}

	# Returns a FlatString of 1 char made of the UTF-8 bytes of `self`
	#
	# The bytes are copied in a newly allocated, NUL-terminated buffer.
	redef fun to_s: FlatString import FlatString.full, UnicodeChar.len `{
		int len = UnicodeChar_len(recv);
		char* r = malloc(len + 1);
		r[len] = '\0';
		uint32_t src = *recv;
		/* Back to the most-significant-byte-first layout to address single bytes */
		if(!IS_BIG_ENDIAN){
			uint32_t tmp = htonl(src);
			memcpy(&src, &tmp, 4);
		}
		unsigned char* s = (unsigned char*) &src;
		switch(len){
			case 1: memcpy(r, s+3, 1); break;
			case 2: memcpy(r, s+2, 2); break;
			case 3: memcpy(r, s+1, 3); break;
			case 4: memcpy(r, s, 4); break;
		}
		return new_FlatString_full(r, 0, len - 1, len, 1);
	`}
end
175
# Remembers where the most recently accessed char of a String is located
class CharCache
	# Index of the cached char, counted in chars
	var position: Int

	# Offset of the cached char in the NativeString underlying the String, counted in bytes
	var bytepos: Int
end
183
# Iterator over the chars of a FlatString, from a given char down to the first one
class FlatStringReviter
	super IndexedIterator[UnicodeChar]

	# The NativeString to iterate upon
	private var ns: NativeString

	# The position in the string, in chars
	private var pos: Int

	# The position in the native string, in bytes
	private var bytepos: Int

	# Starts the iteration on the last char of `s`
	init(s: FlatString) do from(s, s.length - 1)

	# Starts the iteration on the char of `s` located at `position`
	init from(s: FlatString, position: Int)
	do
		ns = s.items
		pos = position
		bytepos = s.byte_index(position)
	end

	# Moves to the previous char
	redef fun next
	do
		pos -= 1
		# Nothing precedes the first char: do not read bytes located before it
		if pos < 0 then return
		bytepos -= 1
		# Skip the continuation bytes (10xxxxxx) until a leading byte is found
		while ns[bytepos].ascii.bin_and(0xC0) == 0x80 do
			bytepos -= 1
		end
	end

	redef fun index do return pos

	redef fun item do return new UnicodeChar.from_ns(ns, bytepos)

	redef fun is_ok do return pos >= 0
end
220
# Iterator over the chars of a FlatString, from a given char up to the last one
class FlatStringIter
	super IndexedIterator[UnicodeChar]

	# The NativeString to iterate upon
	private var ns: NativeString

	# The position in the string, in chars
	private var pos: Int

	# The position in the native string, in bytes
	private var bytepos: Int

	# Length of the iterated string, in chars
	private var slen: Int

	# The char at the current position, valid only when `is_created` is true
	private var it: UnicodeChar

	# Has `it` been built for the current position?
	private var is_created: Bool

	# Starts the iteration on the first char of `s`
	init(s: FlatString) do from(s, 0)

	# Starts the iteration on the char of `s` located at `position`
	init from(s: FlatString, position: Int) do
		ns = s.items
		pos = position
		bytepos = s.byte_index(position)
		slen = s.length
		# No char is built until `item` or `next` asks for one
		is_created = false
	end

	redef fun index do return pos

	redef fun is_ok do return pos < slen

	# The char at the current position, built on first access then reused
	redef fun item do
		if not is_created then
			it = new UnicodeChar.from_ns(ns, bytepos)
			is_created = true
		end
		return it
	end

	# Moves to the following char
	redef fun next
	do
		# The byte width of the current char tells where the following one starts
		var pace = item.len
		is_created = false
		pos += 1
		bytepos += pace
	end
end
268
redef class FlatString

	redef type OTHER: FlatString

	# Length in bytes of the string (e.g. the length of the C string)
	redef var bytelen: Int

	# Cache for the last accessed character in the string
	var cache = new CharCache(-1,-1)

	# Length in chars, computed by `length_l` on first access unless set by `full`
	redef var length = length_l is lazy

	# Builds a string over bytes `from` to `to` (inclusive) of `items`
	#
	# `bytelen` is the length in bytes and `len` the length in chars.
	private init full(items: NativeString, from, to, bytelen, len: Int)
	do
		self.items = items
		index_from = from
		index_to = to
		self.bytelen = bytelen
		length = len
	end

	# Length implementation
	#
	# Counts the chars between `index_from` and `index_to` by jumping from
	# leading byte to leading byte.
	private fun length_l: Int import FlatString.items, FlatString.index_to, FlatString.index_from `{
		char* ns = FlatString_items(recv);
		int i = FlatString_index_from(recv);
		int max = FlatString_index_to(recv);
		int length = 0;
		while(i <= max){
			char c = ns[i];
			if((c & 0x80) == 0) { i+= 1; }
			else if((c & 0xE0) == 0xC0) { i += 2; }
			else if((c & 0xF0) == 0xE0) { i += 3; }
			/* 11110xxx: the mask must keep the five high bits, hence 0xF8 */
			else if((c & 0xF8) == 0xF0) { i += 4; }
			/* Bad leading byte, counted as a 1-byte char */
			else { i += 1; }
			length ++;
		}
		return length;
	`}

	# Char by char comparison
	#
	# When one string is a prefix of the other, the shorter one is the lesser.
	redef fun <(o)
	do
		var mylen = length
		var olen = o.length
		var i = 0
		# Only compare the positions that exist in both strings
		while i < mylen and i < olen do
			var mine = char_at(i)
			var its = o.char_at(i)
			if mine < its then return true
			if its < mine then return false
			i += 1
		end
		return mylen < olen
	end

	# Two FlatString are equal when they hold the same chars in the same order
	redef fun ==(o) do
		if o == null then return false
		if not o isa FlatString then return super
		var mylen = length
		if mylen != o.length then return false
		var i = 0
		while i < mylen do
			if char_at(i) != o.char_at(i) then return false
			i += 1
		end
		return true
	end

	# Position in `items` of the leading byte of the char at `index`
	#
	# The search starts from the closest of the string start, the cached char
	# and the string end. The result is stored in `cache`.
	private fun byte_index(index: Int): Int do
		assert index >= 0
		assert index < length

		# Find best insertion point
		var delta_begin = index
		var delta_end = (length - 1) - index
		var delta_cache = (cache.position - index).abs
		var min = delta_begin

		if delta_cache < min then min = delta_cache
		if delta_end < min then min = delta_end

		# Invariant below: `ns_i` is the leading byte of the char `my_i`
		var ns_i: Int
		var my_i: Int
		var myits = items

		if min == delta_begin then
			ns_i = index_from
			my_i = 0
		else if min == delta_cache then
			ns_i = cache.bytepos
			my_i = cache.position
		else
			# Go back from the last byte to the leading byte of the last char
			ns_i = index_to
			while myits[ns_i].ascii.bin_and(0xC0) == 0x80 do ns_i -= 1
			my_i = length - 1
		end

		# Forward: the leading byte gives the width of the char to skip
		while my_i < index do
			var lead = myits[ns_i].ascii
			if lead.bin_and(0x80) == 0 then
				ns_i += 1
			else if lead.bin_and(0xE0) == 0xC0 then
				ns_i += 2
			else if lead.bin_and(0xF0) == 0xE0 then
				ns_i += 3
			else if lead.bin_and(0xF8) == 0xF0 then
				ns_i += 4
			else
				ns_i += 1
			end
			my_i += 1
		end

		# Backward: step before the current char then skip the continuation
		# bytes (10xxxxxx) to land on the leading byte of the previous char
		while my_i > index do
			ns_i -= 1
			while myits[ns_i].ascii.bin_and(0xC0) == 0x80 do ns_i -= 1
			my_i -= 1
		end

		cache.position = index
		cache.bytepos = ns_i

		return ns_i
	end

	# The char at position `pos` (counted in chars)
	fun char_at(pos: Int): UnicodeChar do
		return new UnicodeChar.from_ns(items, byte_index(pos))
	end

	# Builds a string over bytes `index_from` to `index_to` (inclusive) of `items`
	#
	# The length in chars is left to be computed lazily.
	private init with_bytelen(items: NativeString, index_from: Int, index_to: Int, bytelen: Int) do
		self.items = items
		self.index_from = index_from
		self.index_to = index_to
		self.bytelen = bytelen
	end

	# Reverses the order of the chars, the bytes of each char keep their order
	redef fun reversed do
		var new_str = new NativeString(bytelen)
		# Write position, moving from the end of `new_str` to its start
		var s_pos = bytelen
		# Read position, moving from the start of `self` to its end
		var my_pos = index_from
		var its = items
		for i in [0..length[ do
			var c = char_at(i).len
			s_pos -= c
			its.copy_to(new_str, c, my_pos, s_pos)
			my_pos += c
		end
		return new FlatString.full(new_str, 0, bytelen - 1, bytelen, length)
	end

	# Upper-case version of `self`, see `UnicodeChar::to_upper`
	redef fun to_upper do
		var ns = new NativeString(bytelen)
		var offset = 0
		for i in [0 .. length[
		do
			var c = char_at(i)
			c.to_upper.to_s.items.copy_to(ns, c.len, 0, offset)
			offset += c.len
		end
		return new FlatString.full(ns, 0, bytelen - 1, bytelen, length)
	end

	# Lower-case version of `self`, see `UnicodeChar::to_lower`
	redef fun to_lower do
		var ns = new NativeString(bytelen)
		var offset = 0
		for i in [0 .. length[
		do
			var c = char_at(i)
			c.to_lower.to_s.items.copy_to(ns, c.len, 0, offset)
			offset += c.len
		end
		return new FlatString.full(ns, 0, bytelen - 1, bytelen, length)
	end

	# Concatenation, the bytes of both strings are copied in a new NUL-terminated buffer
	redef fun +(o) do
		if o isa Buffer then o = o.to_s
		if o isa FlatString then
			var new_bytelen = bytelen + o.bytelen
			var new_str = new NativeString(new_bytelen + 1)
			new_str[new_bytelen] = '\0'
			var newlen = length + o.length
			items.copy_to(new_str, bytelen, index_from, 0)
			o.items.copy_to(new_str, o.bytelen, o.index_from, bytelen)
			return new FlatString.full(new_str, 0, new_bytelen - 1, new_bytelen, newlen)
		else if o isa Concat then
			return new Concat(self, o)
		else
			# If it goes to this point, that means another String implementation was concerned, therefore you need to support the + operation for this variant
			abort
		end
	end

	# Repetition, the bytes of `self` are copied `i` times in a new NUL-terminated buffer
	redef fun *(i) do
		var mybtlen = bytelen
		var new_bytelen = mybtlen * i
		var mylen = length
		var newlen = mylen * i
		var ns = new NativeString(new_bytelen + 1)
		ns[new_bytelen] = '\0'
		var offset = 0
		while i > 0 do
			items.copy_to(ns, bytelen, index_from, offset)
			offset += mybtlen
			i -= 1
		end
		return new FlatString.full(ns, 0, new_bytelen - 1, new_bytelen, newlen)
	end

	# O(n)
	#
	# `from` and `count` are expressed in chars.
	# The result shares `items` with `self`.
	redef fun substring(from: Int, count: Int) do
		assert count >= 0

		if from < 0 then
			count += from
			if count < 0 then count = 0
			from = 0
		end

		if count == 0 then return empty
		# Nothing to extract when starting after the last char
		if from >= length then return empty

		var real_from = byte_index(from)

		# Index of the last requested char
		var lst = from + count - 1

		# The request runs past the last char: keep everything up to the last byte
		if lst >= length then
			return new FlatString.with_bytelen(items, real_from, index_to, index_to - real_from + 1)
		end

		# `byte_index` gives the leading byte, the last byte of the char is `last_len - 1` further
		var real_to = byte_index(lst)
		var last_len = char_at(lst).len

		return new FlatString.full(items, real_from, real_to + last_len - 1, (real_to + last_len) - real_from, count)
	end

	# NUL-terminated copy of the bytes of `self`, kept in `real_items` for later calls
	redef fun to_cstring do
		if real_items != null then return real_items.as(not null)
		var new_items = new NativeString(bytelen + 1)
		self.items.copy_to(new_items, bytelen, index_from, 0)
		new_items[bytelen] = '\0'
		self.real_items = new_items
		return new_items
	end
end
511
redef class Text

	# Size of the string, counted in bytes
	fun bytelen: Int is abstract

end
518
redef class FlatBuffer

	# Number of bytes of `items` currently in use
	redef var bytelen: Int

	# Builds a buffer holding a copy of the bytes of `s`
	redef init from(s) do
		if s isa Concat then
			with_capacity(50)
			bytelen = 0
			for i in s.substrings do self.append(i)
			# Everything is copied, the flat cases below do not apply to a Concat
			return
		end
		items = new NativeString(s.bytelen)
		if s isa FlatString then
			s.items.copy_to(items, s.bytelen, s.index_from, 0)
		else
			s.as(FlatBuffer).items.copy_to(items, s.as(FlatBuffer).bytelen, 0, 0)
		end
		length = s.length
		bytelen = s.bytelen
		capacity = s.bytelen
	end

	# Replaces the char at `index` by `item`
	#
	# `index == length` appends `item`.
	# The bytes following the replaced char are shifted when `item` does not
	# have the same byte width.
	fun char_at=(index: Int, item: UnicodeChar) do
		is_dirty = true
		if index == length then
			add_unicode item
			return
		end
		assert index >= 0 and index < length
		var ip = byte_at(index)
		var c = char_at_byte(ip)
		var size_diff = item.len - c.len
		if size_diff > 0 then
			# Make room before shifting, `rshift_bytes` does not grow `items`
			if bytelen + size_diff > capacity then enlarge(bytelen + size_diff)
			rshift_bytes(ip + c.len, size_diff)
		else if size_diff < 0 then
			lshift_bytes(ip + c.len, -size_diff)
		end
		var s = item.to_s
		s.items.copy_to(items, s.bytelen, 0, ip)
	end

	# Shifts the content of the buffer by `len` bytes to the right, starting at byte `from`
	#
	# `bytelen` is increased by `len`. `items` is not enlarged.
	fun rshift_bytes(from: Int, len: Int) import FlatBuffer.bytelen, FlatBuffer.bytelen=, FlatBuffer.items `{
		long bt = FlatBuffer_bytelen(recv);
		char* ns = FlatBuffer_items(recv);
		long off = from + len;
		memmove(ns + off, ns + from, bt - from);
		FlatBuffer_bytelen__assign(recv, bt + len);
	`}

	# Shifts the content of the buffer by `len` bytes to the left, starting at `from`
	#
	# `bytelen` is decreased by `len`.
	fun lshift_bytes(from: Int, len: Int) import FlatBuffer.bytelen, FlatBuffer.bytelen=, FlatBuffer.items `{
		long bt = FlatBuffer_bytelen(recv);
		char* ns = FlatBuffer_items(recv);
		long off = from - len;
		memmove(ns + off, ns + from, bt - from);
		FlatBuffer_bytelen__assign(recv, bt - len);
	`}

	# Get the Unicode char stored at `index` in `self`
	fun char_at(index: Int): UnicodeChar do return new UnicodeChar.from_ns(items, byte_at(index))

	# Get the Unicode char stored at `index` (bytewise) in `self`
	fun char_at_byte(index: Int): UnicodeChar do return new UnicodeChar.from_ns(items, index)

	# Add equivalent that supports Unicode
	#
	# Appends the bytes of `c` after the bytes in use.
	fun add_unicode(c: UnicodeChar) do
		var s = c.to_s
		var needed = bytelen + s.bytelen
		# `enlarge` takes the wanted capacity, not the number of extra bytes
		if needed > capacity then enlarge(needed)
		s.items.copy_to(items, s.bytelen, 0, bytelen)
		bytelen = needed
		length += 1
		is_dirty = true
	end

	# Gets the byte index (in NativeString) of the char stored at `i`
	fun byte_at(i: Int): Int do
		assert i < length and i >= 0
		var ns_i = 0
		var real_i = 0
		# The leading byte of each char gives the number of bytes to skip
		while real_i < i do
			var lead = items[ns_i].ascii
			if lead.bin_and(0x80) == 0 then
				ns_i += 1
			else if lead.bin_and(0xE0) == 0xC0 then
				ns_i += 2
			else if lead.bin_and(0xF0) == 0xE0 then
				ns_i += 3
			else if lead.bin_and(0xF8) == 0xF0 then
				ns_i += 4
			else
				ns_i += 1
			end
			real_i += 1
		end
		return ns_i
	end

	# Grows `items` so that more than `cap` bytes can be stored, the bytes in use are kept
	redef fun enlarge(cap) do
		var c = capacity
		if cap <= c then return
		while c <= cap do c = c * 2 + 2
		var a = new NativeString(c+1)
		if bytelen > 0 then items.copy_to(a, bytelen, 0, 0)
		items = a
		capacity = c
	end

	# Appends the bytes of `s` after the bytes in use
	redef fun append(s) do
		if s isa Concat then
			for i in s.substrings do append i
			# Every part is appended, `s` itself is not a FlatString
			return
		end
		var i = s.as(FlatString)
		var blen = bytelen
		var iblen = i.bytelen
		var newlen = blen + iblen
		if newlen > capacity then
			enlarge(newlen)
		end
		i.items.copy_to(items, iblen, i.index_from, blen)
		bytelen += iblen
		length += i.length
	end

	# Reverses the order of the chars in place, the bytes of each char keep their order
	redef fun reverse
	do
		var nns = new NativeString(bytelen)
		var ns = items
		var btlen = bytelen
		# Read position, moving from the start of `ns` to its end
		var myp = 0
		# Write position, moving from the end of `nns` to its start
		var itsp = btlen
		while myp < btlen do
			var c = char_at_byte(myp).len
			itsp -= c
			ns.copy_to(nns, c, myp, itsp)
			myp += c
		end
		items = nns
		# The new storage is exactly `btlen` bytes long
		capacity = btlen
	end

	redef fun clear do
		length = 0
		bytelen = 0
	end

	# Copies `l` chars of `self`, starting at char `s`, over `d` starting at its char `ns`
	#
	# The bytes are written over the content of `d`, whose lengths are left untouched.
	redef fun copy(s, l, d, ns) do
		if not d isa FlatBuffer then
			# This implementation here is only concerned by the FlatBuffer
			# If you implement a new Buffer subclass, make sure to support this operation via refinement.
			abort
		end
		if l <= 0 then return
		# First byte of the first char to copy
		var rs = byte_at(s)
		# Leading byte of the last char to copy, all its bytes are part of the copy
		var re = byte_at(s + l - 1)
		var rl = re + char_at_byte(re).len - rs
		# Byte of `d` where the copy is written
		var rns = d.byte_at(ns)
		items.copy_to(d.items, rl, rs, rns)
	end

	# Repeats the content of the buffer so that it appears `i` times
	redef fun times(i) do
		var len = bytelen
		var off = len
		var newlen = len * i
		if newlen > capacity then enlarge(newlen)
		# The first occurrence is already in place
		for j in [1 .. i[ do
			items.copy_to(items, len, 0, off)
			off += len
		end
		bytelen = newlen
		length = length * i
	end

	# Upper-cases the buffer in place, see `UnicodeChar::to_upper`
	redef fun upper do
		for i in [0 .. length[ do
			var pos = byte_at(i)
			var c = char_at_byte(pos)
			var d = c.to_upper
			if c == d then continue
			# Only 1-byte chars are changed by `to_upper`
			d.to_s.items.copy_to(items, 1, 0, pos)
		end
	end

	# Lower-cases the buffer in place, see `UnicodeChar::to_lower`
	redef fun lower do
		for i in [0 .. length[ do
			var pos = byte_at(i)
			var c = char_at_byte(pos)
			var d = c.to_lower
			if c == d then continue
			# Only 1-byte chars are changed by `to_lower`
			d.to_s.items.copy_to(items, 1, 0, pos)
		end
	end

	# NUL-terminated copy of the bytes in use
	redef fun to_cstring do
		var ns = new NativeString(bytelen + 1)
		items.copy_to(ns, bytelen, 0, 0)
		ns[bytelen] = '\0'
		return ns
	end
end
711
redef class NativeString

	# FlatString over the bytes of `self`, up to the length given by `cstring_length`
	redef fun to_s: FlatString
	do
		return to_s_with_length(cstring_length)
	end

	# FlatString over the first `len` bytes of `self`, without copy
	redef fun to_s_with_length(len: Int): FlatString
	do
		var last_byte = len - 1
		return new FlatString.with_bytelen(self, 0, last_byte, len)
	end

	# FlatString over a copy of the bytes of `self`
	redef fun to_s_with_copy
	do
		var size = cstring_length
		var duplicate = new NativeString(size + 1)
		copy_to(duplicate, size, 0, 0)
		return new FlatString.with_bytelen(duplicate, 0, size - 1, size)
	end
end
733
redef class OFStream
	# Writes the bytes of `s` to the stream
	#
	# A flat text is written in one go, any other text is written substring by substring.
	redef fun write(s)
	do
		assert is_writable
		if s isa FlatText then
			write_native(s.to_cstring, s.bytelen)
		else
			# The size is a byte count here too: a char may take several bytes
			for i in s.substrings do write_native(i.to_cstring, i.bytelen)
		end
	end
end