lib/string_exp/utf8: Adapted reverse service on UTF-8 strings.
[nit.git] / lib / string_experimentations / utf8.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Introduces UTF-8 as internal encoding for Strings in Nit.
16 module utf8
17
18 intrude import standard::string
19
in "C Header" `{

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <stdint.h>

	/* One UTF-8 character, located by position inside a C string.
	 * <stdlib.h> is required for the `malloc` calls made by the
	 * extern methods of this module. */
	typedef struct {
		long pos;  /* byte offset of the first byte of the character in `ns` */
		char* ns;  /* C string containing the character (referenced, not copied) */
	} UTF8Char;

`}
32
# UTF-8 char as defined in RFC-3629, e.g. 1-4 Bytes
#
# A UTF-8 char has its bytes stored in a NativeString (char*)
extern class UnicodeChar `{ UTF8Char* `}

	# Wraps the character starting at byte offset `pos` of `ns`
	#
	# `ns` is referenced, it is not copied.
	new(pos: Int, ns: NativeString) `{
		UTF8Char* u = malloc(sizeof(UTF8Char));
		u->pos = pos;
		u->ns = ns;
		return u;
	`}

	# Real length of the char in UTF8
	#
	# As per the specification :
	#
	#      Length  | UTF-8 octet sequence
	#              | (binary)
	#     ---------+-------------------------------------------------
	#      1       | 0xxxxxxx
	#      2       | 110xxxxx 10xxxxxx
	#      3       | 1110xxxx 10xxxxxx 10xxxxxx
	#      4       | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	#
	# A lead byte matching none of these patterns is counted as 1 byte long.
	private fun len: Int `{
		unsigned char lead = (unsigned char)recv->ns[recv->pos];
		if((lead & 0x80) == 0x00){ return 1;}
		if((lead & 0xE0) == 0xC0){ return 2;}
		if((lead & 0xF0) == 0xE0){ return 3;}
		// 11110xxx: only the five high bits identify the lead byte, hence 0xF8
		if((lead & 0xF8) == 0xF0){ return 4;}
		// Invalid character
		return 1;
	`}

	# Position in containing NativeString
	private fun pos: Int `{
		return recv->pos;
	`}

	# Sets the position in the containing NativeString to `p`
	private fun pos=(p: Int) `{recv->pos = p;`}

	# C char* wrapping the char
	fun ns: NativeString `{
		return recv->ns;
	`}

	# Returns the Unicode code point representing the character
	#
	# Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence
	fun code_point: Int import UnicodeChar.len `{
		unsigned char* bytes = (unsigned char*)(recv->ns + recv->pos);
		switch(UnicodeChar_len(recv)){
			case 2:
				return ((long)(0x1F & bytes[0]) << 6) |
					(0x3F & bytes[1]);
			case 3:
				return ((long)(0x0F & bytes[0]) << 12) |
					((long)(0x3F & bytes[1]) << 6) |
					(0x3F & bytes[2]);
			case 4:
				return ((long)(0x07 & bytes[0]) << 18) |
					((long)(0x3F & bytes[1]) << 12) |
					((long)(0x3F & bytes[2]) << 6) |
					(0x3F & bytes[3]);
			default:
				// Length 1; the default also guarantees every path returns a value
				return (long)(0x7F & bytes[0]);
		}
	`}

	# Is `o` the same character as `self` ?
	#
	# A `Char` is equal if `self` is 1 byte long and its code point is `o.ascii`,
	# a `UnicodeChar` is equal if both the byte length and the code point match.
	redef fun ==(o)
	do
		if o isa Char then
			return len == 1 and code_point == o.ascii
		else if o isa UnicodeChar then
			return len == o.len and code_point == o.code_point
		end
		return false
	end

	# Copies the bytes of the character in a fresh NUL-terminated buffer
	# and wraps it in a String
	redef fun to_s import NativeString.to_s_with_length, UnicodeChar.len `{
		long len = UnicodeChar_len(recv);
		char* r = malloc(len + 1);
		memcpy(r, recv->ns + recv->pos, len);
		r[len] = '\0';
		return NativeString_to_s_with_length(r, len);
	`}
end
122
# A `StringIndex` is used to keep track of the position of characters in a `FlatString` object
#
# It becomes mandatory for UTF-8 strings since characters do not have a fixed size.
private extern class StringIndex `{ UTF8Char* `}

	# Allocates room for `size` characters, the entries are left uninitialized
	new(size: Int) `{ return malloc(size*sizeof(UTF8Char)); `}

	# Sets the character at `index` as `item`
	#
	# `item` is copied by value in the index.
	fun []=(index: Int, item: UnicodeChar) `{ recv[index] = *item; `}

	# Gets the character at position `id`
	#
	# The result points inside the index, it is not a copy.
	fun [](id: Int): UnicodeChar `{ return &recv[id]; `}

	# Copies a part of self starting at index `my_from` of length `length` into `other`, starting at `its_from`
	#
	# `my_from`, `its_from` and `length` are all counted in characters.
	fun copy_to(other: StringIndex, my_from: Int, its_from: Int, length: Int)`{
		// Arithmetic on UTF8Char* is already scaled by sizeof(UTF8Char),
		// only the byte count given to memcpy needs the explicit factor.
		memcpy(other + its_from, recv + my_from, length * sizeof(UTF8Char));
	`}
end
143
redef class FlatString

	# Index of the characters of the FlatString
	private var index: StringIndex

	# Length in bytes of the string (e.g. the length of the C string)
	var bytelen: Int

	# Builds a FlatString over `items`, with an already computed `index`
	#
	# `len` is stored as the `length` of the string, `bytelen` as its size in bytes.
	private init with_infos_index(items: NativeString, len: Int, index_from: Int, index_to: Int, index: StringIndex, bytelen: Int)
	do
		self.items = items
		length = len
		self.index_from = index_from
		self.index_to = index_to
		self.index = index
		self.bytelen = bytelen
	end

	# Reversed copy of `self`, the bytes of each character are kept in order
	redef fun reversed
	do
		var native = calloc_string(self.bytelen + 1)
		# Terminate the new buffer explicitly, as `*` does
		native[bytelen] = '\0'
		var length = self.length
		var index = self.index
		var pos = 0
		var i = 0
		var ipos = bytelen
		var new_index = new StringIndex(length)
		# The first character of `self` becomes the last one of the result,
		# whose slot in the index is `length - 1`
		var pos_index = length - 1
		while i < length do
			var uchar = index[i]
			var uchar_len = uchar.len
			ipos -= uchar_len
			new_index[pos_index] = new UnicodeChar(ipos, native)
			pos_index -= 1
			items.copy_to(native, uchar_len, pos, ipos)
			pos += uchar_len
			i += 1
		end
		return new FlatString.with_infos_index(native, length, 0, length-1, new_index, bytelen)
	end

	# Concatenation of `i` copies of `self`
	#
	# `i` must be positive or null.
	redef fun *(i)
	do
		assert i >= 0

		var mylen = self.bytelen
		var finlen = mylen * i

		var my_items = self.items

		var my_real_len = length
		var my_real_fin_len = my_real_len * i

		var target_string = calloc_string((finlen) + 1)

		var my_index = index
		var new_index = new StringIndex(my_real_fin_len)

		target_string[finlen] = '\0'

		# Byte offset in `target_string` where the next copy starts
		var current_last = 0
		# Slot in `new_index` of the next character to register
		var curr_index = 0

		for iteration in [1 .. i] do
			my_items.copy_to(target_string, mylen, index_from, current_last)
			# Register the characters of this copy: each entry is rebased
			# on `target_string`, with the same offset as the byte copy above.
			for j in [0 .. my_real_len[ do
				var src = my_index[j]
				new_index[curr_index] = new UnicodeChar(current_last + src.pos - index_from, target_string)
				curr_index += 1
			end
			current_last += mylen
		end

		return new FlatString.with_infos_index(target_string, my_real_fin_len, 0, my_real_fin_len -1, new_index, finlen)

	end

end
218
redef class NativeString

	# Creates the index for said NativeString
	# `length` is the size of the CString (in bytes, up to the first \0)
	# real_len is just a way to store the length (UTF-8 characters)
	#
	# The entries of the index reference `self`, it is not copied.
	private fun make_index(length: Int, real_len: Container[Int]): StringIndex import Container[Int].item=, UnicodeChar.len `{
		long pos = 0;
		long index_pos = 0;
		// One slot per byte: a string never has more characters than bytes
		UTF8Char* index = malloc(length*sizeof(UTF8Char));
		while(pos < length){
			UTF8Char* curr = &index[index_pos];
			curr->pos = pos;
			curr->ns = recv;
			pos += UnicodeChar_len(curr);
			index_pos ++;
		}
		Container_of_Int_item__assign(real_len, index_pos);
		return index;
	`}

	# Wraps `self` in a FlatString, up to the first \0
	redef fun to_s: FlatString
	do
		var len = cstring_length
		return to_s_with_length(len)
	end

	# Wraps the first `len` bytes of `self` in a FlatString, without copy
	redef fun to_s_with_length(len: Int): FlatString
	do
		var real_len = new Container[Int](0)
		var x = make_index(len, real_len)
		return new FlatString.with_infos_index(self, real_len.item, 0, real_len.item - 1, x, len)
	end

	# Copies `self` up to the first \0, and wraps the copy in a FlatString
	redef fun to_s_with_copy
	do
		var length = cstring_length
		var new_self = calloc_string(length + 1)
		copy_to(new_self, length, 0, 0)
		new_self[length] = '\0'
		# Index the copy and not `self`: the entries of the index
		# keep a reference to the NativeString they were built from.
		var real_len = new Container[Int](0)
		var x = new_self.make_index(length, real_len)
		return new FlatString.with_infos_index(new_self, real_len.item, 0, real_len.item - 1, x, length)
	end
end