1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
15 # Introduces UTF-8 as internal encoding for Strings in Nit.
18 intrude import standard
::string
19 intrude import standard
::file
27 #define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)
# UTF-8 char as defined in RFC 3629, i.e. encoded on 1 to 4 bytes
32 extern class UnicodeChar `{ uint32_t* `}
35 redef type OTHER: UnicodeChar
37 # Transforms a byte-variable char* character to its uint32_t equivalent
38 new from_ns(ns: NativeString, index: Int) `{
39 unsigned char
* ret
= calloc
(1,4);
40 if((ns
[index
] & 0x80) == 0){ memcpy(ret + 3, ns + index, 1); }
41 else if((ns
[index
] & 0xE0) == 0xC0) { memcpy(ret + 2, ns + index, 2); }
42 else if((ns
[index
] & 0xF0) == 0xE0) { memcpy(ret + 1, ns + index, 3); }
43 else if((ns
[index
] & 0xF7) == 0xF0) { memcpy(ret, ns + index, 4); }
44 else{ memcpy(ret + 3, ns + index, 1);}
46 uint32_t tmp
= ntohl
(*((uint32_t
*)ret
));
49 return (uint32_t
*)ret
;
52 # Real length of the char in UTF8
54 # As per the specification :
56 # Length | UTF-8 octet sequence
58 # ---------+-------------------------------------------------
60 # 2 | 110xxxxx 10xxxxxx
61 # 3 | 1110xxxx 10xxxxxx 10xxxxxx
62 # 4 | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
65 if(s
<= 127) {return 1;}
66 if(s
>= 49280 && s
<= 57279) {return 2;}
67 if(s
>= 14712960 && s
<= 15712191) {return 3;}
68 if(s
>= 4034953344 && s
<= 4156538815) { return 4; }
73 # Returns the Unicode code point representing the character
75 # Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence
76 fun code_point: Int import UnicodeChar.len `{
79 switch
(UnicodeChar_len(recv
)){
84 ret
= 0 | ((val
& 0x00001F00) >> 2) | (val
& 0x0000003F);
87 ret
= 0 | ((val
& 0x000F0000) >> 4) | ((val
& 0x00003F00) >> 2) | (val
& 0x0000003F);
90 ret
= 0 | ((val
& 0x07000000) >> 6) | ((val
& 0x003F0000) >> 4) | ((val
& 0x00003F00) >> 2) | (val
& 0x0000003F);
93 unsigned char
* rt
= (unsigned char
*) &ret
;
97 # Warning : This does not follow the Unicode specification for now
99 # TODO: Support Unicode-compliant comparison
# Orders two chars by their code points (not a Unicode-compliant collation).
redef fun <(o)
do
	var mine = self.code_point
	var theirs = o.code_point
	return mine < theirs
end
102 # Returns an upper-case version of self
104 # NOTE : Works only on ASCII chars
105 # TODO : Support unicode for to_upper
106 fun to_upper: UnicodeChar import UnicodeChar.code_point `{
107 if(*recv
< 97 || *recv
> 122){ return recv; }
108 uint32_t
* ret
= calloc
(1,4);
# Returns a lower-case version of self
115 # NOTE : Works only on ASCII chars
# TODO : Support unicode for to_lower
117 fun to_lower: UnicodeChar import UnicodeChar.code_point `{
118 if(*recv
< 65 || *recv
> 90){ return recv; }
119 uint32_t
* ret
= calloc
(1,4);
126 if not o isa UnicodeChar then return false
127 if o.code_point == self.code_point then return true
131 redef fun output import UnicodeChar.len `{
132 uint32_t
self = *recv
;
134 uint32_t tmp
= ntohl
(self);
135 memcpy
(&self, &tmp
, 4);
137 unsigned char
* s
= (unsigned char
*) &self;
138 switch
(UnicodeChar_len(recv
)){
143 printf
("%c%c", s
[2], s
[3]);
146 printf
("%c%c%c", s
[1], s
[2], s
[3]);
149 printf
("%c%c%c%c", s
[0], s
[1], s
[2], s
[3]);
154 redef fun to_s: FlatString import FlatString.full, UnicodeChar.len `{
155 int len
= UnicodeChar_len(recv
);
156 char
* r
= malloc
(len
+ 1);
158 uint32_t src
= *recv
;
160 uint32_t tmp
= htonl
(src
);
161 memcpy
(&src
, &tmp
, 4);
163 unsigned char
* s
= (unsigned char
*) &src
;
165 case
1: memcpy
(r
, s
+3, 1); break;
166 case
2: memcpy
(r
, s
+2, 2); break;
167 case
3: memcpy
(r
, s
+1, 3); break;
168 case
4: memcpy
(r
, s
, 4); break;
170 return new_FlatString_full
(r
, 0, len
- 1, len
, 1);
174 # Used to keep track of the last accessed char in a String
176 # The position (as in char) of a String
178 # The position in the NativeString underlying the String
182 class FlatStringReviter
183 super IndexedIterator[UnicodeChar]
185 # The NativeString to iterate upon
186 private var ns: NativeString
188 # The position in the string
191 # The position in the native string
192 private var bytepos: Int
194 init(s: FlatString) do from(s, s.length - 1)
196 init from(s: FlatString, position: Int)
200 bytepos = s.byte_index(position)
206 while ns[bytepos].ascii.bin_and(0xC0) == 0x80 do
212 redef fun index do return pos
214 redef fun item do return new UnicodeChar.from_ns(ns, bytepos)
216 redef fun is_ok do return pos >= 0
220 super IndexedIterator[UnicodeChar]
222 private var ns: NativeString
226 private var bytepos: Int
228 private var slen: Int
230 private var it: UnicodeChar
232 private var is_created: Bool
234 init(s: FlatString) do from(s, 0)
236 init from(s: FlatString, position: Int) do
239 bytepos = s.byte_index(position)
243 redef fun index do return pos
245 redef fun is_ok do return pos < slen
248 if not is_created then
249 it = new UnicodeChar.from_ns(ns, bytepos)
257 if not is_created then
258 it = new UnicodeChar.from_ns(ns, bytepos)
267 redef class FlatString
269 redef type OTHER: FlatString
271 # Length in bytes of the string (e.g. the length of the C string)
272 redef var bytelen: Int
# Cache for the last accessed character in the string
275 var cache = new CharCache(-1,-1)
277 redef var length = length_l is lazy
279 private init full(items: NativeString, from, to, bytelen, len: Int)
284 self.bytelen = bytelen
288 # Length implementation
289 private fun length_l: Int import FlatString.items, FlatString.index_to, FlatString.index_from `{
290 char
* ns
= FlatString_items(recv
);
291 int i
= FlatString_index_from(recv
);
292 int max
= FlatString_index_to(recv
);
296 if((c
& 0x80) == 0) { i+= 1; }
297 else if((c
& 0xE0) == 0xC0) { i += 2; }
298 else if((c
& 0xF0) == 0xE0) { i += 3; }
299 else if((c
& 0xF7) == 0xF0) { i += 4; }
310 for i in [0 .. length[ do
311 if o_pos >= olen then return false
312 if char_at(i) > o.char_at(i) then return false
313 if char_at(i) < o.char_at(i) then return true
319 if o == null then return false
320 if not o isa FlatString then return super
322 var itslen = o.length
323 if mylen != itslen then return false
327 while mypos < mylen do
328 if char_at(mypos) != o.char_at(itspos) then return false
335 private fun byte_index(index: Int): Int do
337 assert index < length
339 # Find best insertion point
340 var delta_begin = index
341 var delta_end = (length - 1) - index
342 var delta_cache = (cache.position - index).abs
343 var min = delta_begin
345 if delta_cache < min then min = delta_cache
346 if delta_end < min then min = delta_end
352 if min == delta_begin then
355 else if min == delta_cache then
357 my_i = cache.position
363 while my_i < index do
364 if myits[ns_i].ascii.bin_and(0x80) == 0 then
366 else if myits[ns_i].ascii.bin_and(0xE0) == 0xC0 then
368 else if myits[ns_i].ascii.bin_and(0xF0) == 0xE0 then
else if myits[ns_i].ascii.bin_and(0xF8) == 0xF0 then # 4-byte lead 11110xxx: mask the top five bits
378 while my_i > index do
379 if myits[ns_i].ascii.bin_and(0xC0) != 0x80 then
381 if my_i == index then break
386 cache.position = index
392 fun char_at(pos: Int): UnicodeChar do
393 return new UnicodeChar.from_ns(items, byte_index(pos))
396 private init with_bytelen(items: NativeString, index_from: Int, index_to: Int, bytelen: Int) do
398 self.index_from = index_from
399 self.index_to = index_to
400 self.bytelen = bytelen
403 redef fun reversed do
404 var new_str = calloc_string(bytelen)
406 var my_pos = index_from
408 for i in [0..length[ do
409 var c = char_at(i).len
411 its.copy_to(new_str, c, my_pos, s_pos)
414 return new FlatString.full(new_str, 0, bytelen - 1, bytelen, length)
417 redef fun to_upper do
418 var ns = calloc_string(bytelen)
420 for i in [0 .. length[
423 c.to_upper.to_s.items.copy_to(ns, c.len, 0, offset)
426 return new FlatString.full(ns, 0, bytelen - 1, bytelen, length)
429 redef fun to_lower do
430 var ns = calloc_string(bytelen)
432 for i in [0 .. length[
435 c.to_lower.to_s.items.copy_to(ns, c.len, 0, offset)
438 return new FlatString.full(ns, 0, bytelen - 1, bytelen, length)
442 if o isa Buffer then o = o.to_s
443 if o isa FlatString then
444 var new_str = calloc_string(bytelen + o.bytelen + 1)
445 var new_bytelen = bytelen + o.bytelen
446 new_str[new_bytelen] = '\0'
447 var newlen = length + o.length
448 items.copy_to(new_str, bytelen, index_from, 0)
449 o.items.copy_to(new_str, o.bytelen, o.index_from, bytelen)
450 return new FlatString.full(new_str, 0, new_bytelen - 1, new_bytelen, newlen)
451 else if o isa RopeString then
452 return new RopeString.from(self) + o
454 # If it goes to this point, that means another String implementation was concerned, therefore you need to support the + operation for this variant
460 var mybtlen = bytelen
461 var new_bytelen = mybtlen * i
463 var newlen = mylen * i
464 var ns = calloc_string(new_bytelen + 1)
465 ns[new_bytelen] = '\0'
468 items.copy_to(ns, bytelen, index_from, offset)
472 return new FlatString.full(ns, 0, new_bytelen - 1, new_bytelen, newlen)
476 redef fun substring(from: Int, count: Int) do
481 if count < 0 then count = 0
485 if count == 0 then return empty
487 var real_from = byte_index(from)
489 var lst = from + count - 1
# `lst` is the index of the last requested char: when it runs past the end,
# return everything from `real_from` up to `index_to`.
# The byte length of an inclusive range is `to - from + 1`.
if lst >= length then
	return new FlatString.with_bytelen(items, real_from, index_to, index_to - real_from + 1)
495 var real_to = byte_index(lst)
497 return new FlatString.full(items, real_from, real_to, (real_to + char_at(lst).len) - real_from, count)
500 redef fun to_cstring do
501 if real_items != null then return real_items.as(not null)
502 var new_items = calloc_string(bytelen + 1)
503 self.items.copy_to(new_items, bytelen, index_from, 0)
504 new_items[bytelen] = '\0'
505 self.real_items = new_items
512 # Length of the string, in bytes
513 fun bytelen: Int is abstract
517 redef class FlatBuffer
519 redef var bytelen: Int
521 redef init from(s) do
522 if s isa RopeString then
524 for i in s.substrings do self.append(i)
526 items = calloc_string(s.bytelen)
527 if s isa FlatString then
528 s.items.copy_to(items, s.bytelen, s.index_from, 0)
530 s.as(FlatBuffer).items.copy_to(items, s.as(FlatBuffer).bytelen, 0, 0)
# Replaces the char at `index` by `item`
538 fun char_at=(index: Int, item: UnicodeChar) do
540 if index == length then
544 assert index >= 0 and index < length
545 var ip = byte_at(index)
546 var c = char_at_byte(ip)
547 var size_diff = item.len - c.len
548 if size_diff > 0 then
549 rshift_bytes(ip + c.len, size_diff)
550 else if size_diff < 0 then
551 lshift_bytes(ip + c.len, -size_diff)
554 s.items.copy_to(items, s.bytelen, 0, ip)
# Shifts the content of the buffer by `len` bytes to the right, starting at byte `from`
558 fun rshift_bytes(from: Int, len: Int) import FlatBuffer.bytelen, FlatBuffer.bytelen=, FlatBuffer.items `{
559 long bt
= FlatBuffer_bytelen(recv
);
560 char
* ns
= FlatBuffer_items(recv
);
561 int off
= from
+ len
;
562 memmove
(ns
+ off
, ns
+ from
, bt
- from
);
563 FlatBuffer_bytelen__assign(recv
, bt
+ len
);
# Shifts the content of the buffer by `len` bytes to the left, starting at byte `from`
567 fun lshift_bytes(from: Int, len: Int) import FlatBuffer.bytelen, FlatBuffer.bytelen=, FlatBuffer.items `{
568 long bt
= FlatBuffer_bytelen(recv
);
569 char
* ns
= FlatBuffer_items(recv
);
570 int off
= from
- len
;
571 memmove
(ns
+ off
, ns
+ from
, bt
- from
);
572 FlatBuffer_bytelen__assign(recv
, bt
- len
);
# Unicode char found at char position `index` in `self`
fun char_at(index: Int): UnicodeChar
do
	var byte_pos = byte_at(index)
	return new UnicodeChar.from_ns(items, byte_pos)
end
# Unicode char starting at byte position `index` in `self`
fun char_at_byte(index: Int): UnicodeChar
do
	var decoded = new UnicodeChar.from_ns(items, index)
	return decoded
end
581 # Add equivalent that supports Unicode
582 fun add_unicode(c: UnicodeChar) do
# Request room for the existing bytes plus the encoded char: `enlarge` returns
# early when its argument does not exceed the current size, so asking for
# `s.bytelen` alone never grew the buffer.
if s.bytelen + bytelen > capacity then enlarge(s.bytelen + bytelen)
s.items.copy_to(items, s.bytelen, 0, bytelen)
# Gets the byte index (in NativeString) of the char stored at `i`
589 fun byte_at(i: Int): Int do
590 assert i < length and i >= 0
594 if items[ns_i].ascii.bin_and(0x80) == 0 then
596 else if items[ns_i].ascii.bin_and(0xE0) == 0xC0 then
598 else if items[ns_i].ascii.bin_and(0xF0) == 0xE0 then
else if items[ns_i].ascii.bin_and(0xF8) == 0xF0 then # 4-byte lead 11110xxx: mask the top five bits
610 redef fun enlarge(cap) do
612 if cap <= c then return
613 while c <= cap do c = c * 2 + 2
614 var a = calloc_string(c+1)
615 if bytelen > 0 then items.copy_to(a, bytelen, 0, 0)
620 redef fun append(s) do
621 if s isa RopeString then
622 for i in s.substrings do append i
624 var i = s.as(FlatString)
626 var iblen = i.bytelen
627 var newlen = blen + iblen
628 if newlen > capacity then
631 i.items.copy_to(items, iblen, i.index_from, blen)
638 var nns = calloc_string(bytelen)
644 var c = char_at_byte(myp).len
646 ns.copy_to(nns, c, myp, itsp)
657 redef fun copy(s, l, d, ns) do
658 if not d isa FlatBuffer then
659 # This implementation here is only concerned by the FlatBuffer
660 # If you implement a new Buffer subclass, make sure to support this operation via refinement.
664 var re = byte_at(s + l - 1)
666 var rns = d.byte_at(ns)
667 items.copy_to(d.items, rl, rns, rs)
670 redef fun times(i) do
674 if newlen > capacity then enlarge(newlen)
676 items.copy_to(items, len, 0, off)
684 for i in [0 .. length[ do
686 var c = char_at_byte(pos)
688 if c == d then continue
689 d.to_s.items.copy_to(items, 1, 0, pos)
694 for i in [0 .. length[ do
696 var c = char_at_byte(pos)
698 if c == d then continue
699 d.to_s.items.copy_to(items, 1, 0, pos)
703 redef fun to_cstring do
# Reserve one extra byte and terminate the copy, as `FlatString::to_cstring` does,
# so the result is a valid C string.
var ns = calloc_string(bytelen + 1)
items.copy_to(ns, bytelen, 0, 0)
ns[bytelen] = '\0'
710 redef class NativeString
712 redef fun to_s: FlatString
714 var len = cstring_length
715 return to_s_with_length(len)
718 redef fun to_s_with_length(len: Int): FlatString
720 return new FlatString.with_bytelen(self, 0, len - 1, len)
723 redef fun to_s_with_copy
725 var length = cstring_length
726 var new_self = calloc_string(length + 1)
727 copy_to(new_self, length, 0, 0)
728 return new FlatString.with_bytelen(new_self, 0, length - 1, length)
# Both branches pass a size in bytes: `length` counts chars, which is smaller
# than the byte size as soon as the text holds a multi-byte char.
if s isa FlatText then
	write_native(s.to_cstring, s.bytelen)
else for i in s.substrings do write_native(i.to_cstring, i.bytelen)