Merge: Windows: fix bootstrap
[nit.git] / lib / core / text / native.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # Native structures for text and bytes
12 module native
13
14 import kernel
15 import math
16 import fixed_ints
17
in "C" `{
	// Declarations needed by the extern method bodies of this module:
	// atof() and, on Windows, _byteswap_ulong() come from <stdlib.h>;
	// uint32_t comes from <stdint.h>.
	#include <stdlib.h>
	#include <stdint.h>

	// Select a be32toh() implementation (big-endian 32 bits to host order).
	#ifdef __linux__
		// Expected to provide be32toh() on Linux.
		#include <endian.h>
	#endif
	#ifdef __APPLE__
		#include <libkern/OSByteOrder.h>
		#define be32toh(x) OSSwapBigToHostInt32(x)
	#endif
	#ifdef _WIN32
		// Unconditional byte swap: this assumes a little-endian host.
		#define be32toh(val) _byteswap_ulong(val)
	#endif

	// Last resort when no platform above defined be32toh as a macro:
	// fall back on the betoh32() spelling.
	#ifndef be32toh
		#define be32toh(val) betoh32(val)
	#endif

	#include <assert.h>
	#include <string.h>
`}
37
redef class Int
	# Byte length of the UTF-8 sequence introduced by the lead byte `self`
	#
	# Only the bit pattern of `self` is inspected.
	# A value that matches no known lead-byte pattern (a continuation byte
	# for instance) is counted as a single byte.
	fun u8len: Int do
		# 0xxx_xxxx: plain ASCII
		if self & 0b1000_0000 == 0 then return 1
		# 110x_xxxx: lead of a 2-byte sequence
		if self & 0b1110_0000 == 0b1100_0000 then return 2
		# 1110_xxxx: lead of a 3-byte sequence
		if self & 0b1111_0000 == 0b1110_0000 then return 3
		# 1111_0xxx: lead of a 4-byte sequence
		if self & 0b1111_1000 == 0b1111_0000 then return 4
		# Anything else is treated as a lone byte
		return 1
	end

	# Does `self` have the bit pattern of a UTF-8 sequence start ?
	#
	# ASCII bytes and the lead bytes of 2, 3 and 4-byte sequences are
	# accepted; everything else is rejected.
	#
	# ~~~nit
	# assert 0.is_valid_utf8_start
	# assert 0xC0.is_valid_utf8_start
	# assert 0xE0.is_valid_utf8_start
	# assert 0xF0.is_valid_utf8_start
	# ~~~
	fun is_valid_utf8_start: Bool do
		var is_ascii = (self & 0x80) == 0
		var is_lead2 = (self & 0b1110_0000) == 0b1100_0000
		var is_lead3 = (self & 0b1111_0000) == 0b1110_0000
		var is_lead4 = (self & 0b1111_1000) == 0b1111_0000
		return is_ascii or is_lead2 or is_lead3 or is_lead4
	end
end
70
redef class UInt32
	# Decodes the UTF-16 surrogate pair packed in `self` into its code point
	#
	# The high surrogate is read from the upper 16 bits of `self`,
	# the low surrogate from the lower 16 bits.
	#
	#     assert 0xD83DDE02u32.from_utf16_surr == 0x1F602u32
	fun from_utf16_surr: UInt32 do
		var high = (self & 0xFFFF0000u32) >> 16
		var low = self & 0xFFFFu32
		# 10 payload bits from each surrogate, offset by the start of the
		# supplementary planes
		return ((high - 0xD800u32) << 10) + (low - 0xDC00u32) + 0x10000u32
	end

	# The character whose (Unicode) code point is `self`
	#
	#     assert 65u32.code_point == 'A'
	#     assert 10u32.code_point == '\n'
	#     assert 0x220Bu32.code_point == '∋'
	fun code_point: Char `{ return self; `}
end
92
# C string `char *`
#
# Used as underlying implementation for `String` and some other `Text`.
extern class CString `{ char* `}
	# Create a new `CString` with the capacity for `length` characters
	new(length: Int) is intern

	# Get a char* starting at `index`.
	#
	# WARNING: Unsafe for extern code, use only for temporary
	# pointer manipulation purposes (e.g. write to file or such)
	fun fast_cstring(index: Int): CString is intern

	# Get the byte at `index`, as an `Int`.
	fun [](index: Int): Int is intern

	# Set the byte at `index` to `item`.
	fun []=(index: Int, item: Int) is intern

	# Copy `length` bytes of `self`, starting at `from`, into `dest` at offset `to`.
	fun copy_to(dest: CString, length: Int, from: Int, to: Int) is intern

	# Identity comparison: two `CString` are equal iff they are the same instance.
	redef fun ==(o) is intern do return is_same_instance(o)

	redef fun !=(o) is intern do return not is_same_instance(o)

	# Position of the first nul character.
	fun cstring_length: Int
	do
		var l = 0
		while self[l] != 0 do l += 1
		return l
	end

	# Parse `self` as an Int.
	fun atoi: Int is intern

	# Parse `self` as a Float.
	fun atof: Float `{ return atof(self); `}

	# Gets the UTF-8 char starting at byte index `pos`
	#
	# `pos` is an index in bytes, not in Unicode chars.
	#
	# ~~~raw
	# assert "かきく".as(FlatString).items.char_at(0) == 'か'
	# ~~~
	#
	# If the bytes at position `pos` do not form a well-formed sequence,
	# the Unicode replacement character � (0xFFFD) is returned.
	#
	# ~~~raw
	# assert "かきく".as(FlatString).items.char_at(1) == '�'
	# ~~~
	#
	# NOTE: for a non-ASCII byte, the 4 bytes starting at `pos` are read
	# at once, whatever the actual length of the sequence.
	fun char_at(pos: Int): Char do
		var c = self[pos]
		# Fast path: ASCII
		if c & 0x80 == 0 then return c.code_point
		# `b` holds bytes pos..pos+3, the byte at `pos` being the most significant
		var b = fetch_4_hchars(pos)
		var ret = 0u32
		# 2nd byte must be a continuation byte (10xx_xxxx)
		if (b & 0xC00000u32) != 0x800000u32 then return 0xFFFD.code_point
		if b & 0xE0000000u32 == 0xC0000000u32 then
			# 2-byte sequence: 110x_xxxx 10xx_xxxx
			ret |= (b & 0x1F000000u32) >> 18
			ret |= (b & 0x3F0000u32) >> 16
			return ret.code_point
		end
		# 3rd byte must be a continuation byte
		if (b & 0xC000u32) != 0x8000u32 then return 0xFFFD.code_point
		if b & 0xF0000000u32 == 0xE0000000u32 then
			# 3-byte sequence: 1110_xxxx 10xx_xxxx 10xx_xxxx
			ret |= (b & 0xF000000u32) >> 12
			ret |= (b & 0x3F0000u32) >> 10
			ret |= (b & 0x3F00u32) >> 8
			return ret.code_point
		end
		# 4th byte must be a continuation byte
		if (b & 0xC0u32) != 0x80u32 then return 0xFFFD.code_point
		if b & 0xF8000000u32 == 0xF0000000u32 then
			# 4-byte sequence: 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
			ret |= (b & 0x7000000u32) >> 6
			ret |= (b & 0x3F0000u32) >> 4
			ret |= (b & 0x3F00u32) >> 2
			ret |= b & 0x3Fu32
			return ret.code_point
		end
		# Not a known lead byte
		return 0xFFFD.code_point
	end

	# Gets the byte index of char at position `n` in UTF-8 String
	fun char_to_byte_index(n: Int): Int do return char_to_byte_index_cached(n, 0, 0)

	# Gets the length of the character at position `pos` (1 if invalid sequence)
	#
	# `pos` is a byte index. A lead byte only counts for its full length
	# when all the continuation bytes it announces are present.
	fun length_of_char_at(pos: Int): Int do
		var c = self[pos]
		if c & 0x80 == 0x00 then
			return 1
		else if c & 0xE0 == 0xC0 and self[pos + 1] & 0xC0 == 0x80 then
			return 2
		else if c & 0xF0 == 0xE0 and self[pos + 1] & 0xC0 == 0x80 and self[pos + 2] & 0xC0 == 0x80 then
			return 3
		else if c & 0xF8 == 0xF0 and self[pos + 1] & 0xC0 == 0x80 and self[pos + 2] & 0xC0 == 0x80 and self[pos + 3] & 0xC0 == 0x80 then
			return 4
		else
			return 1
		end
	end

	# Gets the byte index of char at position `n` in UTF-8 String
	#
	# `char_from` and `byte_from` are cached values to seek from.
	#
	# NOTE: char_from and byte_from are not guaranteed to be valid cache values
	# It is up to the client to ensure the validity of the information
	fun char_to_byte_index_cached(n, char_from, byte_from: Int): Int do
		var ns_i = byte_from
		var my_i = char_from

		# Number of chars left to travel; negative means seeking backwards
		var dist = n - my_i

		while dist > 0 do
			# Skip 4 chars at once while the next 4 bytes are all ASCII
			while dist >= 4 do
				var i = fetch_4_chars(ns_i)
				if i & 0x80808080u32 != 0u32 then break
				ns_i += 4
				my_i += 4
				dist -= 4
			end
			if dist == 0 then break
			ns_i += length_of_char_at(ns_i)
			my_i += 1
			dist -= 1
		end

		while dist < 0 do
			# Same fast path, going backwards
			while dist <= -4 do
				var i = fetch_4_chars(ns_i - 4)
				if i & 0x80808080u32 != 0u32 then break
				ns_i -= 4
				my_i -= 4
				dist += 4
			end
			if dist == 0 then break
			ns_i = find_beginning_of_char_at(ns_i - 1)
			my_i -= 1
			dist += 1
		end

		return ns_i
	end

	# Gets the char index of byte at position `n` in a UTF-8 String
	#
	# `char_from` and `byte_from` are cached values to seek from.
	#
	# NOTE: char_from and byte_from are not guaranteed to be valid cache values
	# It is up to the client to ensure the validity of the information
	fun byte_to_char_index_cached(n, char_from, byte_from: Int): Int do
		var ns_i = byte_from
		var my_i = char_from

		# Walk forward one char at a time until byte `n` is reached or passed
		while ns_i < n do
			ns_i += length_of_char_at(ns_i)
			my_i += 1
		end

		# Walk backward one char at a time when the cache is after `n`
		while ns_i > n do
			ns_i = find_beginning_of_char_at(ns_i - 1)
			my_i -= 1
		end

		return my_i
	end

	# Returns the beginning position of the char at position `pos`
	#
	# If the char is invalid UTF-8, `pos` is returned as-is
	#
	# The backward search never goes before index 0, nor further than
	# 3 bytes before `pos` (the longest sequence handled here is 4 bytes).
	#
	# ~~~raw
	# assert "abc".items.find_beginning_of_char_at(2) == 2
	# assert "か".items.find_beginning_of_char_at(1) == 0
	# assert [0x41, 233].to_s.items.find_beginning_of_char_at(1) == 1
	# ~~~
	fun find_beginning_of_char_at(pos: Int): Int do
		var endpos = pos
		var c = self[pos]
		if c & 0x80 == 0x00 then return pos
		# Lowest index the walk may reach: bounded by the start of the
		# buffer and by the maximal number of continuation bytes.
		var min_pos = pos - 3
		if min_pos < 0 then min_pos = 0
		# Step back over continuation bytes (10xx_xxxx)
		while c & 0xC0 == 0x80 and pos > min_pos do
			pos -= 1
			c = self[pos]
		end
		var stpos = pos
		# Accept the candidate start only if its sequence reaches `endpos`
		if length_of_char_at(stpos) >= (endpos - stpos + 1) then return pos
		return endpos
	end

	# Number of UTF-8 characters in `self` starting at `from`, for a length of `byte_length`
	fun utf8_length(from, byte_length: Int): Int is intern do
		var st = from
		var ln = 0
		while byte_length > 0 do
			# Count 4 chars at once while the next 4 bytes are all ASCII
			while byte_length >= 4 do
				var i = fetch_4_chars(st)
				if i & 0x80808080u32 != 0u32 then break
				byte_length -= 4
				st += 4
				ln += 4
			end
			if byte_length == 0 then break
			var cln = length_of_char_at(st)
			st += cln
			ln += 1
			byte_length -= cln
		end
		return ln
	end

	# Fetch 4 chars in `self` at `pos`
	#
	# The 4 bytes are returned as stored in memory (host byte order).
	fun fetch_4_chars(pos: Int): UInt32 is intern `{
		/* memcpy instead of a pointer cast: `self + pos` may be unaligned */
		uint32_t word;
		memcpy(&word, self + pos, sizeof(word));
		return word;
	`}

	# Fetch 4 chars in `self` at `pos`, the byte at `pos` being the most significant
	#
	# The 4 bytes are read as a big-endian value and converted to host order.
	fun fetch_4_hchars(pos: Int): UInt32 is intern `{
		/* memcpy instead of a pointer cast: `self + pos` may be unaligned */
		uint32_t word;
		memcpy(&word, self + pos, sizeof(word));
		return (uint32_t)be32toh(word);
	`}

	# Right shifts `len` bytes of `self` from `sh` bytes starting at position `pos`
	fun rshift(sh, len, pos: Int) do
		copy_to(self, len, pos, pos + sh)
	end

	# Left shifts `len` bytes of `self` from `sh` bytes starting at position `pos`
	fun lshift(sh, len, pos: Int) do
		copy_to(self, len, pos, pos - sh)
	end

	# Sets the contents of `self` to `value` for `len` bytes
	#
	# `len` must not be negative.
	fun memset(value, len: Int) `{
		assert(len >= 0);
		memset(self, value, len);
	`}
end