1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
15 # Introduces UTF-8 as internal encoding for Strings in Nit.
18 intrude import standard
::string
33 # UTF-8 char as defined in RFC-3629, e.g. 1-4 Bytes
35 # A UTF-8 char has its bytes stored in a NativeString (char*)
36 extern class UnicodeChar `{ UTF8Char* `}
# Builds a UnicodeChar from a byte position `pos` and the NativeString `ns` that holds it.
#
# The visible part of the body allocates one `UTF8Char` with `malloc`.
# NOTE(review): the statements that fill the struct and return it are not visible in this excerpt;
# presumably they store `pos` and `ns` in the new struct — confirm against the full file.
38 new(pos: Int, ns: NativeString) `{
39 UTF8Char* u
= malloc
(sizeof
(UTF8Char));
# Real length of the char in UTF-8, in bytes
#
# As per the specification (RFC 3629), the lead byte gives the length:
#
#     Length | UTF-8 octet sequence
#     -------+-------------------------------------
#     1      | 0xxxxxxx
#     2      | 110xxxxx 10xxxxxx
#     3      | 1110xxxx 10xxxxxx 10xxxxxx
#     4      | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
#
# A byte that matches none of these patterns (continuation or invalid byte)
# is reported as length 1 so that the C function always returns a value.
private fun len: Int `{
	/* Lead byte of the sequence, read the same way `code_point` reads it. */
	unsigned char nspos = (unsigned char)recv->ns[recv->pos];

	if((nspos & 0x80) == 0x00){ return 1;}
	if((nspos & 0xE0) == 0xC0){ return 2;}
	if((nspos & 0xF0) == 0xE0){ return 3;}
	/* 11110xxx fixes the five high bits, hence mask 0xF8.
	 * The former mask 0xF7 only matched 0xF0 and 0xF8, so the valid
	 * lead bytes 0xF1..0xF4 were never recognised as 4-byte sequences. */
	if((nspos & 0xF8) == 0xF0){ return 4;}

	/* Not a valid lead byte: fall back to a single byte. */
	return 1;
`}
68 # Position in containing NativeString
69 private fun pos: Int `{
# Sets the position of this char inside its containing NativeString to `p`
private fun pos=(p: Int) `{
	recv->pos = p;
`}
75 # C char* wrapping the char
76 fun ns
: NativeString `{
80 # Returns the Unicode code point representing the character
82 # Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence
# Decodes the bytes of this char into an integer code point.
#
# The body switches on `UnicodeChar_len(recv)`. Each visible `return` reads between one and
# four consecutive bytes starting at `recv->pos` in `recv->ns`: the first byte is masked with
# 0x7F, 0x1F, 0x0F or 0x07, every following byte with 0x3F, and the pieces are shifted
# into place in steps of six bits.
# NOTE(review): the `case` labels and the closing lines are not visible in this excerpt;
# presumably the four returns belong to cases 1 to 4 in that order — confirm.
83 fun code_point
: Int import UnicodeChar.len
`{
84 switch(UnicodeChar_len(recv)){
86 return (long)(0x7F & (unsigned char)recv->ns[recv->pos]);
88 return 0 | ((0x1F & (unsigned char)recv->ns[recv->pos]) << 6) | (0x3F & (unsigned char)recv->ns[recv->pos+1]);
90 return 0 | ((0x0F & (unsigned char)recv->ns[recv->pos]) << 12) |
91 ((0x3F & (unsigned char)recv->ns[recv->pos+1]) << 6) |
92 (0x3F & (unsigned char)recv->ns[recv->pos+2]);
94 return 0 | ((0x07 & (unsigned char)recv->ns[recv->pos]) << 18) |
95 ((0x3F & (unsigned char)recv->ns[recv->pos+1]) << 12) |
96 ((0x3F & (unsigned char)recv->ns[recv->pos+2]) << 6) |
97 (0x3F & (unsigned char)recv->ns[recv->pos+3]);
# Equality test against `o` (the method header is not visible in this excerpt).
#
# First branch: a char longer than one byte is never equal; otherwise the result is true
# when `code_point` equals `o.ascii`.
# Second branch (`o isa UnicodeChar`): both chars must have the same `len` and the
# same `code_point`.
# NOTE(review): the condition guarding the first branch is not visible; presumably it
# tests that `o` is a plain `Char` — confirm against the full file.
104 if len
!= 1 then return false
105 if code_point
== o
.ascii
then return true
106 else if o
isa UnicodeChar then
107 if len
!= o
.len
then return false
108 if code_point
== o
.code_point
then return true
# Returns this single char as a String.
#
# The visible code asks for the byte length of the char, allocates `len + 1` bytes,
# points `src` at the first byte of the char inside `recv->ns`, and hands the buffer
# to `NativeString_to_s_with_length` together with `len`.
# NOTE(review): the lines that copy `src` into `r` are not visible in this excerpt.
# NOTE(review): the length is obtained through the mangled symbol
# `utf8___UnicodeChar_len___impl`, while `code_point` declares `import UnicodeChar.len`
# and calls `UnicodeChar_len`; the two should probably use the same mechanism.
113 redef fun to_s
import NativeString.to_s_with_length
`{
114 int len = utf8___UnicodeChar_len___impl(recv);
115 char* r = malloc(len + 1);
117 char* src = (recv->ns + recv->pos);
119 return NativeString_to_s_with_length(r, len);
# A `StringIndex` is used to keep track of the position of characters in a `FlatString` object
#
# It becomes mandatory for UTF-8 strings since characters do not have a fixed size.
# The index is a C array of `UTF8Char` entries.
private extern class StringIndex `{ UTF8Char* `}

	# Allocates room for `size` entries; the entries themselves are left uninitialised
	new(size: Int) `{ return malloc(size*sizeof(UTF8Char)); `}

	# Sets the character at `index` as `item`
	#
	# The `UTF8Char` behind `item` is copied by value into the index.
	fun []=(index: Int, item: UnicodeChar) `{ recv[index] = *item; `}

	# Gets the character at position `id`
	#
	# The result points into the index itself; it is not a copy.
	fun [](id: Int): UnicodeChar `{ return &recv[id]; `}

	# Copies a part of self starting at index `my_from` of length `length` into `other`, starting at `its_from`
	#
	# `my_from`, `its_from` and `length` are all counted in index entries, not in bytes.
	# Nothing is copied when `length` is not positive.
	fun copy_to(other: StringIndex, my_from: Int, its_from: Int, length: Int) `{
		if (length <= 0) return;

		/* Arithmetic on a UTF8Char* already advances by whole entries,
		 * so the offsets must not be multiplied by sizeof(UTF8Char). */
		UTF8Char* myfrom = recv + my_from;
		UTF8Char* itsfrom = other + its_from;

		/* memcpy counts bytes: scale the number of entries accordingly. */
		memcpy(itsfrom, myfrom, length * sizeof(UTF8Char));
	`}
end
144 redef class FlatString
146 # Index of the characters of the FlatString
147 private var index
: StringIndex
149 # Length in bytes of the string (e.g. the length of the C string)
# Builds a FlatString from already prepared parts.
#
# Takes the native buffer `items`, a length `len`, the bounds `index_from` and `index_to`,
# the character `index` and the byte length `bytelen`.
# The visible assignments store `index_from`, `index_to` and `bytelen` on `self`.
# NOTE(review): the remaining assignments and the `do`/`end` lines are not visible in this excerpt.
152 private init with_infos_index
(items
: NativeString, len
: Int, index_from
: Int, index_to
: Int, index
: StringIndex, bytelen
: Int)
156 self.index_from
= index_from
157 self.index_to
= index_to
159 self.bytelen
= bytelen
# Body of a FlatString method that builds a new string of the same size (header not visible).
#
# Visible steps: a buffer of `bytelen + 1` bytes is obtained from `calloc_string`, a
# `StringIndex` of `length` entries is created, and `pos_index` starts at `length`.
# For a char `uchar`, its byte length is read, a `UnicodeChar` located at `ipos` in the
# new buffer is stored at `new_index[pos_index]`, and `uchar_len` bytes are copied from
# `items` (offset `pos`) to the new buffer (offset `ipos`).
# The result is built with `with_infos_index` over the new buffer and the new index.
# NOTE(review): presumably this is `reversed`, filling the index from the end — the loop
# header and the updates of `pos_index`, `pos` and `ipos` are not visible; confirm.
164 var native
= calloc_string
(self.bytelen
+ 1)
165 var length
= self.length
166 var index
= self.index
170 var new_index
= new StringIndex(length
)
171 var pos_index
= length
174 var uchar_len
= uchar
.len
176 new_index
[pos_index
] = new UnicodeChar(ipos
, native
)
178 items
.copy_to
(native
, uchar_len
, pos
, ipos
)
182 return new FlatString.with_infos_index
(native
, length
, 0, length-1
, new_index
, bytelen
)
# Body of a FlatString method that repeats the string `i` times (header not visible).
#
# Visible steps: the target byte length is `bytelen * i` and the target char count is
# `length * i`. A buffer of `finlen + 1` bytes is obtained from `calloc_string` and
# terminated with '\0', and a `StringIndex` of `my_real_fin_len` entries is created.
# Each of the `i` iterations copies `mylen` bytes of `my_items`, starting at `index_from`,
# to offset `current_last` of the target, then advances `current_last` by `mylen`.
# NOTE(review): `StringIndex.copy_to` is declared as `(other, my_from, its_from, length)`,
# but the call below passes `(new_index, length, 0, curr_index)`, which follows the
# `NativeString.copy_to` order used on the line before it. As written, `length` is taken
# as the source offset and `curr_index` as the number of entries — confirm and reorder.
# NOTE(review): no update of `curr_index` is visible in this excerpt — confirm it advances.
189 var mylen
= self.bytelen
190 var finlen
= mylen
* i
192 var my_items
= self.items
194 var my_real_len
= length
195 var my_real_fin_len
= my_real_len
* i
197 var target_string
= calloc_string
((finlen
) + 1)
200 var new_index
= new StringIndex(my_real_fin_len
)
202 target_string
[finlen
] = '\0'
207 for iteration
in [1 .. i
] do
208 my_items
.copy_to
(target_string
, mylen
, index_from
, current_last
)
209 my_index
.copy_to
(new_index
, length
, 0, curr_index
)
210 current_last
+= mylen
213 return new FlatString.with_infos_index
(target_string
, my_real_fin_len
, 0, my_real_fin_len
-1, new_index
, finlen
)
219 redef class NativeString
221 # Creates the index for said NativeString
222 # `length` is the size of the CString (in bytes, up to the first \0)
223 # real_len is just a way to store the length (UTF-8 characters)
# Builds the `StringIndex` of this NativeString and reports its number of characters.
#
# `length` is used as the number of `UTF8Char` entries to allocate, i.e. one entry per byte.
# The visible code takes the entry at `index_pos`, advances `pos` by the byte length of
# that entry (`UnicodeChar_len`), and finally stores `index_pos` into `real_len`
# through the imported `Container[Int].item=` setter.
# NOTE(review): the loop header, the initialisation of each entry and the `return` are
# not visible in this excerpt.
224 private fun make_index
(length
: Int, real_len
: Container[Int]): StringIndex import Container[Int].item
=, UnicodeChar.len
`{
227 UTF8Char* index = malloc(length*sizeof(UTF8Char));
229 UTF8Char* curr = &index[index_pos];
232 pos += UnicodeChar_len(curr);
235 Container_of_Int_item__assign(real_len, index_pos);
# Converts `self` to a `FlatString`, using `cstring_length` as its length in bytes
redef fun to_s: FlatString
do
	return to_s_with_length(cstring_length)
end
# Wraps `self` in a `FlatString` without copying, `len` being its length in bytes
#
# The character index is built by `make_index`, which also reports the number of
# characters through a `Container[Int]`.
redef fun to_s_with_length(len: Int): FlatString
do
	var char_count = new Container[Int](0)
	var char_index = make_index(len, char_count)
	# Read only after `make_index` has filled the container.
	var chars = char_count.item
	return new FlatString.with_infos_index(self, chars, 0, chars - 1, char_index, len)
end
# Builds a `FlatString` over a fresh copy of the bytes of `self`
#
# The byte length comes from `cstring_length`; the character index is built by
# `make_index` on `self` before the bytes are copied into the new buffer.
redef fun to_s_with_copy
do
	var char_count = new Container[Int](0)
	var byte_count = cstring_length
	var char_index = make_index(byte_count, char_count)

	var duplicate = calloc_string(byte_count + 1)
	copy_to(duplicate, byte_count, 0, 0)

	var chars = char_count.item
	return new FlatString.with_infos_index(duplicate, chars, 0, chars - 1, char_index, byte_count)
end