1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
15 # Introduces UTF-8 as internal encoding for Strings in Nit.
18 intrude import standard
::string
19 intrude import standard
::file
# UTF-8 char as defined in RFC-3629, i.e. 1-4 Bytes
36 # A UTF-8 char has its bytes stored in a NativeString (char*)
37 extern class UTF8Char `{ UTF8Char* `}
39 new(pos: Int, ns: NativeString) `{
40 UTF8Char* u
= malloc
(sizeof
(UTF8Char));
46 # Real length of the char in UTF8
48 # As per the specification :
51 # Length | UTF-8 octet sequence
53 # ---------+-------------------------------------------------
55 # 2 | 110xxxxx 10xxxxxx
56 # 3 | 1110xxxx 10xxxxxx 10xxxxxx
57 # 4 | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
private fun len: Int `{
	/* The high bits of `nspos` are matched against the UTF-8 patterns
	 * listed in the table above (RFC 3629). */
	if((nspos & 0x80) == 0x00){ return 1;} /* 0xxxxxxx */
	if((nspos & 0xE0) == 0xC0){ return 2;} /* 110xxxxx */
	if((nspos & 0xF0) == 0xE0){ return 3;} /* 1110xxxx */
	/* The mask must be 0xF8 to isolate the five leading bits. The previous
	 * 0xF7 ignored bit 3 and compared the three low payload bits instead,
	 * so lead bytes 0xF1-0xF4 were never recognised as 4-byte sequences. */
	if((nspos & 0xF8) == 0xF0){ return 4;} /* 11110xxx */
71 # Position in containing NativeString
72 private fun pos: Int `{
# Sets the position of the char in its containing NativeString to `p`
private fun pos=(p: Int) `{
	recv->pos = p;
`}
78 # C char* wrapping the char
79 fun ns
: NativeString `{
83 # Returns the Unicode code point representing the character
85 # Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence
86 fun code_point
: Int import UTF8Char.len
`{
87 switch(UTF8Char_len(recv)){
89 return (long)(0x7F & (unsigned char)recv->ns[recv->pos]);
91 return 0 | ((0x1F & (unsigned char)recv->ns[recv->pos]) << 6) | (0x3F & (unsigned char)recv->ns[recv->pos+1]);
93 return 0 | ((0x0F & (unsigned char)recv->ns[recv->pos]) << 12) |
94 ((0x3F & (unsigned char)recv->ns[recv->pos+1]) << 6) |
95 (0x3F & (unsigned char)recv->ns[recv->pos+2]);
97 return 0 | ((0x07 & (unsigned char)recv->ns[recv->pos]) << 18) |
98 ((0x3F & (unsigned char)recv->ns[recv->pos+1]) << 12) |
99 ((0x3F & (unsigned char)recv->ns[recv->pos+2]) << 6) |
100 (0x3F & (unsigned char)recv->ns[recv->pos+3]);
104 # Returns an upper-case version of self
106 # NOTE : Works only on ASCII chars
107 # TODO : Support unicode for to_upper
108 fun to_upper
: UTF8Char import UTF8Char.code_point
`{
109 int cp = UTF8Char_code_point(recv);
110 if(cp < 97 || cp > 122){ return recv; }
111 char* ns = malloc(2);
113 char c = recv->ns[recv->pos];
115 UTF8Char* ret = malloc(sizeof(UTF8Char));
# Returns a lower-case version of self
123 # NOTE : Works only on ASCII chars
# TODO : Support unicode for to_lower
125 fun to_lower
: UTF8Char import UTF8Char.code_point
`{
126 int cp = UTF8Char_code_point(recv);
127 if(cp < 65 || cp > 90){ return recv; }
128 char* ns = malloc(2);
130 char c = recv->ns[recv->pos];
132 UTF8Char* ret = malloc(sizeof(UTF8Char));
141 if len
!= 1 then return false
142 if code_point
== o
.ascii
then return true
143 else if o
isa UTF8Char then
144 if len
!= o
.len
then return false
145 if code_point
== o
.code_point
then return true
150 redef fun output
import UTF8Char.code_point
`{
151 switch(UTF8Char_len(recv)){
153 printf("%c", recv->ns[recv->pos]);
156 printf("%c%c", recv->ns[recv->pos], recv->ns[recv->pos + 1]);
159 printf("%c%c%c", recv->ns[recv->pos], recv->ns[recv->pos + 1], recv->ns[recv->pos + 2]);
162 printf("%c%c%c%c", recv->ns[recv->pos], recv->ns[recv->pos + 1], recv->ns[recv->pos + 2], recv->ns[recv->pos + 3]);
167 redef fun to_s
import NativeString.to_s_with_length
`{
168 int len = utf8___UTF8Char_len___impl(recv);
169 char* r = malloc(len + 1);
171 char* src = (recv->ns + recv->pos);
173 return NativeString_to_s_with_length(r, len);
177 # A `StringIndex` is used to keep track of the position of characters in a `FlatString` object
179 # It becomes mandatory for UTF-8 strings since characters do not have a fixed size.
180 private extern class StringIndex `{ UTF8Char* `}
# Allocates an index with room for `size` UTF8Char entries
new(size: Int) `{
	size_t bytes = size * sizeof(UTF8Char);
	return malloc(bytes);
`}
# Stores a copy of `item` in the slot at position `index`
fun []=(index: Int, item: UTF8Char) `{
	UTF8Char* slot = recv + index;
	*slot = *item;
`}
# Gets the character at position `id`
#
# The returned value points into the index itself; it is not a copy.
fun [](id: Int): UTF8Char `{
	UTF8Char* slot = recv + id;
	return slot;
`}
# Copies `length` characters of self, starting at index `my_from`, into `other`, starting at index `its_from`
fun copy_to(other: StringIndex, my_from: Int, its_from: Int, length: Int) `{
	/* Pointer arithmetic on UTF8Char* already advances by whole entries,
	 * so the offsets must not be multiplied by sizeof(UTF8Char). */
	UTF8Char* myfrom = recv + my_from;
	UTF8Char* itsfrom = other + its_from;
	/* memcpy counts bytes: convert the number of entries into a byte count. */
	memcpy(itsfrom, myfrom, length * sizeof(UTF8Char));
198 redef class FlatString
200 # Index of the characters of the FlatString
201 private var index
: StringIndex
203 # Length in bytes of the string (e.g. the length of the C string)
206 private init with_infos_index
(items
: NativeString, len
: Int, index_from
: Int, index_to
: Int, index
: StringIndex, bytelen
: Int)
210 self.index_from
= index_from
211 self.index_to
= index_to
213 self.bytelen
= bytelen
218 if real_items
!= null then return real_items
.as(not null)
219 var new_items
= new NativeString(bytelen
+ 1)
220 self.items
.copy_to
(new_items
, bytelen
, index
[index_from
].pos
, 0)
221 new_items
[bytelen
] = '\0'
222 self.real_items
= new_items
226 redef fun substring
(from
, count
)
232 if count
< 0 then count
= 0
236 if count
== 0 then return empty
var real_from = index_from + from
var real_to = real_from + count - 1
if real_to > index_to then real_to = index_to
# `index` is shared with the source string, so its entries are addressed
# with the absolute positions `real_from` and `real_to`, not with `from`.
var from_char = index[real_from]
var to_char = index[real_to]
# Byte span: first byte of the first kept char up to the last byte of the
# last kept char, hence the length of the *last* char is added.
var sub_bytelen = to_char.pos - from_char.pos + to_char.len
# When `real_to` was clamped above, fewer than `count` chars are kept.
var sub_length = real_to - real_from + 1
return new FlatString.with_infos_index(items, sub_length, real_from, real_to, index, sub_bytelen)
250 var native
= new NativeString(self.bytelen
+ 1)
251 var length
= self.length
252 var index
= self.index
256 var new_index
= new StringIndex(length
)
257 var pos_index
= length
260 var uchar_len
= uchar
.len
262 new_index
[pos_index
] = new UTF8Char(ipos
, native
)
264 items
.copy_to
(native
, uchar_len
, pos
, ipos
)
268 return new FlatString.with_infos_index
(native
, length
, 0, length-1
, new_index
, bytelen
)
275 var mylen
= self.bytelen
276 var finlen
= mylen
* i
278 var my_items
= self.items
280 var my_real_len
= length
281 var my_real_fin_len
= my_real_len
* i
283 var target_string
= new NativeString((finlen
) + 1)
286 var new_index
= new StringIndex(my_real_fin_len
)
288 target_string
[finlen
] = '\0'
293 for iteration
in [1 .. i
] do
294 my_items
.copy_to
(target_string
, mylen
, index_from
, current_last
)
295 my_index
.copy_to
(new_index
, length
, 0, curr_index
)
296 current_last
+= mylen
299 return new FlatString.with_infos_index
(target_string
, my_real_fin_len
, 0, my_real_fin_len
-1, new_index
, finlen
)
305 var outstr
= new NativeString(self.bytelen
+ 1)
308 var index
= self.index
313 var u
= index
[ipos
].to_upper
314 u
.ns
.copy_to
(outstr
, u
.len
, u
.pos
, out_index
)
319 outstr
[self.bytelen
] = '\0'
321 return outstr
.to_s_with_length
(self.bytelen
)
326 var outstr
= new NativeString(self.bytelen
+ 1)
329 var index
= self.index
334 var u
= index
[ipos
].to_lower
335 u
.ns
.copy_to
(outstr
, u
.len
, u
.pos
, out_index
)
340 outstr
[self.bytelen
] = '\0'
342 return outstr
.to_s_with_length
(self.bytelen
)
347 var i
= self.index_from
348 var imax
= self.index_to
357 redef class FlatBuffer
359 # Fix for this particular implementation
361 # Since the to_s of a FlatBuffer now builds using
# the old String constructor, this breaks everything.
364 # This will disappear when UTF8 is fully-supported
367 return to_cstring
.to_s_with_length
(length
)
371 redef class NativeString
373 # Creates the index for said NativeString
374 # `length` is the size of the CString (in bytes, up to the first \0)
375 # real_len is just a way to store the length (UTF-8 characters)
376 private fun make_index
(length
: Int, real_len
: Container[Int]): StringIndex import Container[Int].item
=, UTF8Char.len
`{
379 UTF8Char* index = malloc(length*sizeof(UTF8Char));
381 UTF8Char* curr = &index[index_pos];
384 pos += UTF8Char_len(curr);
387 Container_of_Int_item__assign(real_len, index_pos);
391 redef fun to_s
: FlatString
393 var len
= cstring_length
394 return to_s_with_length
(len
)
397 redef fun to_s_with_length
(len
: Int): FlatString
399 var real_len
= new Container[Int](0)
400 var x
= make_index
(len
, real_len
)
401 return new FlatString.with_infos_index
(self, real_len
.item
, 0, real_len
.item
- 1, x
, len
)
404 redef fun to_s_with_copy
406 var real_len
= new Container[Int](0)
407 var length
= cstring_length
408 var x
= make_index
(length
, real_len
)
409 var new_self
= new NativeString(length
+ 1)
410 copy_to
(new_self
, length
, 0, 0)
411 return new FlatString.with_infos_index
(new_self
, real_len
.item
, 0, real_len
.item
- 1, x
, length
)
419 if s
isa FlatText then
420 if s
isa FlatString then
421 write_native
(s
.to_cstring
, s
.bytelen
)
423 write_native
(s
.to_cstring
, s
.length
)
425 else for i
in s
.substrings
do write_native
(i
.to_cstring
, i
.length
)