all: fix broken markdown comments with missing or unwanted code blocks
[nit.git] / lib / string_experimentations / utf8.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Introduces UTF-8 as internal encoding for Strings in Nit.
16 module utf8
17
18 intrude import standard::string
19 intrude import standard::file
20
in "C Header" `{

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

// One UTF-8 character, located by its byte offset inside a C string.
// <stdlib.h> is required because the C bodies of this module call malloc().
typedef struct {
	long pos;  // byte offset of the character's first byte within `ns`
	char* ns;  // C string holding the bytes of the character
} UTF8Char;

`}
33
# UTF-8 char as defined in RFC-3629, i.e. 1 to 4 bytes
#
# A UTF-8 char has its bytes stored in a NativeString (char*)
extern class UnicodeChar `{ UTF8Char* `}

	# Creates a char whose first byte is at offset `pos` of `ns`
	#
	# `ns` is referenced as is, its bytes are not copied.
	new(pos: Int, ns: NativeString) `{
		UTF8Char* u = malloc(sizeof(UTF8Char));
		u->pos = pos;
		u->ns = ns;
		return u;
	`}

	# Real length of the char in UTF8
	#
	# As per the specification :
	#
	# ~~~raw
	# Length | UTF-8 octet sequence
	#        | (binary)
	# -------+-------------------------------------
	# 1      | 0xxxxxxx
	# 2      | 110xxxxx 10xxxxxx
	# 3      | 1110xxxx 10xxxxxx 10xxxxxx
	# 4      | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	# ~~~
	#
	# A leading byte that matches none of these patterns is counted as
	# a 1-byte char.
	private fun len: Int `{
		unsigned char lead = (unsigned char)recv->ns[recv->pos];
		if((lead & 0x80) == 0x00){ return 1;}
		if((lead & 0xE0) == 0xC0){ return 2;}
		if((lead & 0xF0) == 0xE0){ return 3;}
		// The mask must keep the five high bits (11110xxx), hence 0xF8
		if((lead & 0xF8) == 0xF0){ return 4;}
		// Invalid character
		return 1;
	`}

	# Position in containing NativeString
	private fun pos: Int `{
		return recv->pos;
	`}

	# Sets the position in the containing NativeString to `p`
	private fun pos=(p: Int) `{recv->pos = p;`}

	# C char* wrapping the char
	fun ns: NativeString `{
		return recv->ns;
	`}

	# Returns the Unicode code point representing the character
	#
	# Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence
	fun code_point: Int import UnicodeChar.len `{
		// Read the bytes as unsigned so that masking never sees a sign extension
		const unsigned char* bytes = (const unsigned char*)(recv->ns + recv->pos);
		switch(UnicodeChar_len(recv)){
			case 2:
				return ((0x1F & bytes[0]) << 6) | (0x3F & bytes[1]);
			case 3:
				return ((0x0F & bytes[0]) << 12) |
					((0x3F & bytes[1]) << 6) |
					(0x3F & bytes[2]);
			case 4:
				return ((0x07 & bytes[0]) << 18) |
					((0x3F & bytes[1]) << 12) |
					((0x3F & bytes[2]) << 6) |
					(0x3F & bytes[3]);
			default:
				// Length 1; being the default, every path returns a value
				return 0x7F & bytes[0];
		}
	`}

	# Returns an upper-case version of self
	#
	# `self` is returned unchanged when it is not an ASCII lower-case letter,
	# otherwise a new char backed by a freshly allocated 1-byte string is returned.
	#
	# NOTE : Works only on ASCII chars
	# TODO : Support unicode for to_upper
	fun to_upper: UnicodeChar import UnicodeChar.code_point `{
		long cp = UnicodeChar_code_point(recv);
		if(cp < 'a' || cp > 'z'){ return recv; }
		char* ns = malloc(2);
		ns[0] = (char)(cp - ('a' - 'A'));
		ns[1] = '\0';
		UTF8Char* ret = malloc(sizeof(UTF8Char));
		ret->ns = ns;
		ret->pos = 0;
		return ret;
	`}

	# Returns a lower-case version of self
	#
	# `self` is returned unchanged when it is not an ASCII upper-case letter,
	# otherwise a new char backed by a freshly allocated 1-byte string is returned.
	#
	# NOTE : Works only on ASCII chars
	# TODO : Support unicode for to_lower
	fun to_lower: UnicodeChar import UnicodeChar.code_point `{
		long cp = UnicodeChar_code_point(recv);
		if(cp < 'A' || cp > 'Z'){ return recv; }
		char* ns = malloc(2);
		ns[0] = (char)(cp + ('a' - 'A'));
		ns[1] = '\0';
		UTF8Char* ret = malloc(sizeof(UTF8Char));
		ret->ns = ns;
		ret->pos = 0;
		return ret;
	`}

	# Equality with a `Char` or with another `UnicodeChar`
	#
	# A `Char` is equal if `self` is 1 byte long and its code point is `o.ascii`.
	# A `UnicodeChar` is equal if both the length and the code point match.
	# Anything else is different.
	redef fun ==(o)
	do
		if o isa Char then return len == 1 and code_point == o.ascii
		if o isa UnicodeChar then return len == o.len and code_point == o.code_point
		return false
	end

	# Prints the bytes of the char on the standard output
	redef fun output import UnicodeChar.len `{
		fwrite(recv->ns + recv->pos, 1, UnicodeChar_len(recv), stdout);
	`}

	# Returns a string made of a copy of the bytes of the char
	redef fun to_s import NativeString.to_s_with_length, UnicodeChar.len `{
		long len = UnicodeChar_len(recv);
		char* r = malloc(len + 1);
		memcpy(r, recv->ns + recv->pos, len);
		r[len] = '\0';
		return NativeString_to_s_with_length(r, len);
	`}
end
176
# A `StringIndex` is used to keep track of the position of characters in a `FlatString` object
#
# It becomes mandatory for UTF-8 strings since characters do not have a fixed size.
private extern class StringIndex `{ UTF8Char* `}

	# Allocates room for `size` characters, the entries are left uninitialized
	new(size: Int) `{ return malloc(size*sizeof(UTF8Char)); `}

	# Sets the character at `index` as `item`
	#
	# The content of `item` is copied into the index.
	fun []=(index: Int, item: UnicodeChar) `{ recv[index] = *item; `}

	# Gets the character at position `id`
	#
	# The result points inside the index, it is not a copy.
	fun [](id: Int): UnicodeChar `{ return &recv[id]; `}

	# Copies a part of self starting at index `my_from` of length `length` into `other`, starting at `its_from`
	#
	# `my_from`, `its_from` and `length` are all counted in characters.
	fun copy_to(other: StringIndex, my_from: Int, its_from: Int, length: Int)`{
		// Arithmetic on UTF8Char* is already scaled by the size of an entry,
		// only the byte count given to memcpy needs the sizeof factor.
		memcpy(other + its_from, recv + my_from, length * sizeof(UTF8Char));
	`}
end
197
redef class FlatString

	# Index of the characters of the FlatString
	private var index: StringIndex

	# Length in bytes of the string (e.g. the length of the C string)
	var bytelen: Int

	# Builds a string on `items` from already known informations
	#
	# `len` is the length in characters, `index_from` and `index_to` are
	# positions in `index`, and `bytelen` is the length in bytes.
	private init with_infos_index(items: NativeString, len: Int, index_from: Int, index_to: Int, index: StringIndex, bytelen: Int)
	do
		self.items = items
		length = len
		self.index_from = index_from
		self.index_to = index_to
		self.index = index
		self.bytelen = bytelen
	end

	# Returns a NUL-terminated copy of the bytes of the string
	#
	# The copy is cached in `real_items` and reused by later calls.
	redef fun to_cstring
	do
		if real_items != null then return real_items.as(not null)
		var new_items = new NativeString(bytelen + 1)
		# An empty string has no index entry to read a byte offset from
		if bytelen > 0 then
			self.items.copy_to(new_items, bytelen, index[index_from].pos, 0)
		end
		new_items[bytelen] = '\0'
		self.real_items = new_items
		return new_items
	end

	# Returns `count` characters starting at character `from`
	#
	# A negative `from` shortens `count` accordingly, and the result is
	# truncated at the end of the string. `items` and `index` are shared
	# with the result.
	redef fun substring(from, count)
	do
		assert count >= 0

		if from < 0 then
			count += from
			if count < 0 then count = 0
			from = 0
		end

		if count == 0 then return empty

		var real_from = index_from + from

		# Nothing to extract when starting after the last character
		if real_from > index_to then return empty

		var real_to = real_from + count - 1

		if real_to > index_to then real_to = index_to

		# Bytes from the first byte of the first char to the last byte of the last char
		var last_char = index[real_to]
		var sub_bytelen = (last_char.pos - index[real_from].pos) + last_char.len

		return new FlatString.with_infos_index(items, real_to - real_from + 1, real_from, real_to, index, sub_bytelen)
	end

	# Returns a new string with the characters of `self` in reverse order
	#
	# The bytes of each character are kept in their original order.
	redef fun reversed
	do
		var native = new NativeString(self.bytelen + 1)
		var length = self.length
		var index = self.index
		var new_index = new StringIndex(length)
		# Byte offset in `native` where the next character is written
		var ipos = bytelen
		var i = 0
		while i < length do
			# `index` may be shared with a larger string, hence `index_from`
			var uchar = index[index_from + i]
			var uchar_len = uchar.len
			ipos -= uchar_len
			# The i-th character becomes the i-th from the end
			new_index[length - 1 - i] = new UnicodeChar(ipos, native)
			items.copy_to(native, uchar_len, uchar.pos, ipos)
			i += 1
		end
		native[bytelen] = '\0'
		return new FlatString.with_infos_index(native, length, 0, length-1, new_index, bytelen)
	end

	# Returns `self` repeated `i` times
	redef fun *(i)
	do
		assert i >= 0

		var mylen = self.bytelen
		var finlen = mylen * i

		var target_string = new NativeString(finlen + 1)
		target_string[finlen] = '\0'

		if finlen > 0 then
			# Byte offset of the first character, which differs from `index_from`
			var first_byte = index[index_from].pos
			var current_last = 0
			for iteration in [1 .. i] do
				items.copy_to(target_string, mylen, first_byte, current_last)
				current_last += mylen
			end
		end

		# Index the new bytes from scratch so that every entry refers to
		# `target_string` with the right offset
		return target_string.to_s_with_length(finlen)
	end

	# Returns an upper-case version of self
	#
	# Only ASCII letters are changed, see `UnicodeChar::to_upper`.
	redef fun to_upper
	do
		var outstr = new NativeString(self.bytelen + 1)

		var out_index = 0
		var index = self.index
		var ipos = 0
		var max = length

		while ipos < max do
			# `index` may be shared with a larger string, hence `index_from`
			var u = index[index_from + ipos].to_upper
			u.ns.copy_to(outstr, u.len, u.pos, out_index)
			out_index += u.len
			ipos += 1
		end

		outstr[self.bytelen] = '\0'

		return outstr.to_s_with_length(self.bytelen)
	end

	# Returns a lower-case version of self
	#
	# Only ASCII letters are changed, see `UnicodeChar::to_lower`.
	redef fun to_lower
	do
		var outstr = new NativeString(self.bytelen + 1)

		var out_index = 0
		var index = self.index
		var ipos = 0
		var max = length

		while ipos < max do
			# `index` may be shared with a larger string, hence `index_from`
			var u = index[index_from + ipos].to_lower
			u.ns.copy_to(outstr, u.len, u.pos, out_index)
			out_index += u.len
			ipos += 1
		end

		outstr[self.bytelen] = '\0'

		return outstr.to_s_with_length(self.bytelen)
	end

	# Prints the characters from `index_from` to `index_to` one after the other
	redef fun output
	do
		var i = self.index_from
		var imax = self.index_to
		while i <= imax do
			index[i].output
			i += 1
		end
	end

end
358
redef class FlatBuffer

	# Workaround specific to this implementation
	#
	# The regular `to_s` of a FlatBuffer goes through the old String
	# constructor, which does not set up what this module adds to
	# `FlatString`. The string is built from the C string of the buffer instead.
	#
	# This will disappear when UTF8 is fully-supported
	redef fun to_s
	do
		written = false
		var native = to_cstring
		return native.to_s_with_length(length)
	end
end
372
redef class NativeString

	# Creates the index for said NativeString
	#
	# `length` is the size of the CString (in bytes, up to the first \0).
	# `real_len` is an output: its `item` is set to the number of UTF-8
	# characters that were found.
	#
	# Every entry of the returned index refers to `self`.
	private fun make_index(length: Int, real_len: Container[Int]): StringIndex import Container[Int].item=, UnicodeChar.len `{
		long pos = 0;
		long index_pos = 0;
		// One entry per byte is an upper bound of the number of characters
		UTF8Char* index = malloc(length*sizeof(UTF8Char));
		while(pos < length){
			UTF8Char* curr = &index[index_pos];
			curr->pos = pos;
			curr->ns = recv;
			// Jump to the first byte of the next character
			pos += UnicodeChar_len(curr);
			index_pos ++;
		}
		Container_of_Int_item__assign(real_len, index_pos);
		return index;
	`}

	# Builds a string from the bytes of `self` up to the first \0
	redef fun to_s: FlatString
	do
		var len = cstring_length
		return to_s_with_length(len)
	end

	# Builds a string from the first `len` bytes of `self`
	#
	# The bytes are not copied, the string is backed by `self`.
	redef fun to_s_with_length(len: Int): FlatString
	do
		var real_len = new Container[Int](0)
		var x = make_index(len, real_len)
		return new FlatString.with_infos_index(self, real_len.item, 0, real_len.item - 1, x, len)
	end

	# Builds a string backed by a copy of the bytes of `self`
	redef fun to_s_with_copy
	do
		var real_len = new Container[Int](0)
		var length = cstring_length
		var new_self = new NativeString(length + 1)
		copy_to(new_self, length, 0, 0)
		new_self[length] = '\0'
		# Index the copy and not `self`, otherwise the characters of the
		# new string would still refer to the original buffer
		var x = new_self.make_index(length, real_len)
		return new FlatString.with_infos_index(new_self, real_len.item, 0, real_len.item - 1, x, length)
	end
end
416
417 redef class OFStream
# Writes `s` to the stream
#
# The amount given to `write_native` is the length in bytes for a
# `FlatString`, since a UTF-8 character may take several bytes.
redef fun write(s)
do
	assert is_writable
	if s isa FlatText then
		if s isa FlatString then
			write_native(s.to_cstring, s.bytelen)
		else
			write_native(s.to_cstring, s.length)
		end
	else
		for i in s.substrings do
			# Same rule for each part: bytes, not characters, for a FlatString
			if i isa FlatString then
				write_native(i.to_cstring, i.bytelen)
			else
				write_native(i.to_cstring, i.length)
			end
		end
	end
end