0469a7d67666c6ea080c7cce7fe1e25e4343101b
[nit.git] / lib / core / text / native.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # Native structures for text and bytes
12 module native
13
14 import kernel
15 import math
16 import fixed_ints
17
in "C" `{
/*
 * Portable `be32toh`: converts a 32-bit big-endian value to host byte order.
 * It is used by `CString::fetch_4_hchars` so that the first byte read from
 * memory ends up in the most significant byte of the result.
 */
#ifdef __linux__
	#include <endian.h>
#endif
#ifdef __APPLE__
	#include <libkern/OSByteOrder.h>
	#define be32toh(x) OSSwapBigToHostInt32(x)
#endif
#ifdef _WIN32
	#define be32toh(val) _byteswap_ulong(val)
#endif

#ifdef __pnacl__
	/*
	 * Hand-written byte swaps.
	 * Each shifted part is masked so that the parts never overlap;
	 * without the masks the OR-ed parts smear into each other and the
	 * result is not a byte swap.
	 */
	#define be16toh(val) ((((val) >> 8) & 0x00FFu) | (((val) << 8) & 0xFF00u))
	#define be32toh(val) ((((val) >> 24) & 0x000000FFu) | \
			      (((val) >> 8) & 0x0000FF00u) | \
			      (((val) << 8) & 0x00FF0000u) | \
			      (((val) << 24) & 0xFF000000u))
#endif
#ifndef be32toh
	/* Last resort: some systems spell the function `betoh32` */
	#define be32toh(val) betoh32(val)
#endif
`}
38
redef class Byte
	# Length in bytes of the UTF-8 sequence announced by `self` taken as a lead byte
	#
	# Any byte that matches none of the 1 to 4 bytes lead patterns
	# (a continuation byte for instance) counts for 1.
	fun u8len: Int do
		# 0xxxxxxx: plain ASCII
		if (self & 0b1000_0000u8) == 0u8 then return 1
		# 110xxxxx: lead of a 2 bytes sequence
		if (self & 0b1110_0000u8) == 0b1100_0000u8 then return 2
		# 1110xxxx: lead of a 3 bytes sequence
		if (self & 0b1111_0000u8) == 0b1110_0000u8 then return 3
		# 11110xxx: lead of a 4 bytes sequence
		if (self & 0b1111_1000u8) == 0b1111_0000u8 then return 4
		return 1
	end

	# Can `self` be the first byte of a UTF-8 sequence ?
	#
	# Only the bit pattern of the byte is looked at.
	#
	# ~~~nit
	# assert 0u8.is_valid_utf8_start
	# assert 0xC0u8.is_valid_utf8_start
	# assert 0xE0u8.is_valid_utf8_start
	# assert 0xF0u8.is_valid_utf8_start
	# ~~~
	fun is_valid_utf8_start: Bool do
		var one_byte = (self & 0x80u8) == 0u8
		var two_bytes = (self & 0b1110_0000u8) == 0b1100_0000u8
		var three_bytes = (self & 0b1111_0000u8) == 0b1110_0000u8
		var four_bytes = (self & 0b1111_1000u8) == 0b1111_0000u8
		return one_byte or two_bytes or three_bytes or four_bytes
	end
end
71
redef class UInt32
	# Code point encoded by the UTF-16 surrogate pair packed in `self`
	#
	# The high surrogate is read from the upper 16 bits of `self`,
	# the low surrogate from the lower 16 bits.
	#
	#     assert 0xD83DDE02u32.from_utf16_surr == 0x1F602u32
	fun from_utf16_surr: UInt32 do
		var high = (self & 0xFFFF0000u32) >> 16
		var low = self & 0xFFFFu32
		# The high surrogate carries the upper 10 bits, the low one the
		# lower 10 bits; the pair encodes the code point minus 0x10000.
		return ((high - 0xD800u32) << 10) + (low - 0xDC00u32) + 0x10000u32
	end

	# The character whose Unicode code point is `self`
	#
	#     assert 65u32.code_point == 'A'
	#     assert 10u32.code_point == '\n'
	#     assert 0x220Bu32.code_point == '∋'
	fun code_point: Char `{ return self; `}
end
93
# C string `char *`
#
# Used as underlying implementation for `String` and some other `Text`.
#
# All positions taken by the services of this class are byte offsets from
# the start of `self`; no service checks them against the allocated size.
extern class CString `{ char* `}
	# Create a new `CString` with the capacity for `length` characters
	new(length: Int) is intern

	# Get a char* starting at `index`.
	#
	# WARNING: Unsafe for extern code, use only for temporary
	# pointer manipulation purposes (e.g. write to file or such)
	fun fast_cstring(index: Int): CString is intern

	# Get the byte at `index`.
	fun [](index: Int): Byte is intern

	# Set the byte at `index` to `item`.
	fun []=(index: Int, item: Byte) is intern

	# Copy `length` bytes of `self`, starting at `from`, to `dest` at position `to`.
	fun copy_to(dest: CString, length: Int, from: Int, to: Int) is intern

	redef fun ==(o) is intern do return is_same_instance(o)

	redef fun !=(o) is intern do return not is_same_instance(o)

	# Position of the first nul character.
	#
	# The scan starts at index 0 and only stops on a nul byte.
	fun cstring_length: Int
	do
		var l = 0
		while self[l] != 0u8 do l += 1
		return l
	end

	# Parse `self` as an Int.
	fun atoi: Int is intern

	# Parse `self` as a Float.
	fun atof: Float `{ return atof(self); `}

	# Gets the UTF-8 char starting at byte index `pos`
	#
	# `pos` is expressed in bytes, not in Unicode chars.
	#
	# ~~~raw
	# assert "かきく".as(FlatString).items.char_at(0) == 'か'
	# ~~~
	#
	# If the bytes at `pos` do not have the bit patterns of a UTF-8 sequence
	# (e.g. `pos` points in the middle of a char),
	# the Unicode replacement character � (0xFFFD) is returned.
	#
	# ~~~raw
	# assert "かきく".as(FlatString).items.char_at(1) == '�'
	# ~~~
	#
	# NOTE: for a non-ASCII lead byte, the 4 bytes starting at `pos` are
	# read at once, whatever the real length of the sequence.
	fun char_at(pos: Int): Char do
		var c = self[pos]
		# ASCII fast path
		if c & 0x80u8 == 0u8 then return c.ascii
		# `b` holds the 4 bytes from `pos`, the lead byte in the most significant byte
		var b = fetch_4_hchars(pos)
		var ret = 0u32
		# The 2nd byte must be a continuation byte (10xxxxxx)
		if b & 0xC00000u32 != 0x800000u32 then return 0xFFFD.code_point
		# 110xxxxx 10xxxxxx
		if b & 0xE0000000u32 == 0xC0000000u32 then
			ret |= (b & 0x1F000000u32) >> 18
			ret |= (b & 0x3F0000u32) >> 16
			return ret.code_point
		end
		# The 3rd byte must be a continuation byte
		if not b & 0xC000u32 == 0x8000u32 then return 0xFFFD.code_point
		# 1110xxxx 10xxxxxx 10xxxxxx
		if b & 0xF0000000u32 == 0xE0000000u32 then
			ret |= (b & 0xF000000u32) >> 12
			ret |= (b & 0x3F0000u32) >> 10
			ret |= (b & 0x3F00u32) >> 8
			return ret.code_point
		end
		# The 4th byte must be a continuation byte
		if not b & 0xC0u32 == 0x80u32 then return 0xFFFD.code_point
		# 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		if b & 0xF8000000u32 == 0xF0000000u32 then
			ret |= (b & 0x7000000u32) >> 6
			ret |= (b & 0x3F0000u32) >> 4
			ret |= (b & 0x3F00u32) >> 2
			ret |= b & 0x3Fu32
			return ret.code_point
		end
		# The lead byte matches no known pattern
		return 0xFFFD.code_point
	end

	# Gets the byte index of char at position `n` in UTF-8 String
	#
	# The seek starts from the beginning of `self`.
	fun char_to_byte_index(n: Int): Int do return char_to_byte_index_cached(n, 0, 0)

	# Gets the length of the character at position `pos` (1 if invalid sequence)
	#
	# `pos` is a byte index. A multi-byte length is returned only if the
	# lead byte is followed by the expected number of continuation bytes.
	fun length_of_char_at(pos: Int): Int do
		var c = self[pos]
		if c & 0x80u8 == 0x00u8 then
			return 1
		else if c & 0xE0u8 == 0xC0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 then
			return 2
		else if c & 0xF0u8 == 0xE0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 then
			return 3
		else if c & 0xF8u8 == 0xF0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 and self[pos + 3] & 0xC0u8 == 0x80u8 then
			return 4
		else
			return 1
		end
	end

	# Gets the byte index of char at position `n` in UTF-8 String
	#
	# `char_from` and `byte_from` are cached values to seek from.
	#
	# NOTE: char_from and byte_from are not guaranteed to be valid cache values
	# It is up to the client to ensure the validity of the information
	fun char_to_byte_index_cached(n, char_from, byte_from: Int): Int do
		var ns_i = byte_from
		var my_i = char_from

		var dist = n - my_i

		# Seek forward
		while dist > 0 do
			# Fast path: skip 4 chars at once while the next 4 bytes are all ASCII
			while dist >= 4 do
				var i = fetch_4_chars(ns_i)
				if i & 0x80808080u32 != 0u32 then break
				ns_i += 4
				my_i += 4
				dist -= 4
			end
			if dist == 0 then break
			ns_i += length_of_char_at(ns_i)
			my_i += 1
			dist -= 1
		end

		# Seek backward
		while dist < 0 do
			# Fast path: skip 4 chars at once while the previous 4 bytes are all ASCII
			while dist <= -4 do
				var i = fetch_4_chars(ns_i - 4)
				if i & 0x80808080u32 != 0u32 then break
				ns_i -= 4
				my_i -= 4
				dist += 4
			end
			if dist == 0 then break
			ns_i = find_beginning_of_char_at(ns_i - 1)
			my_i -= 1
			dist += 1
		end

		return ns_i
	end

	# Gets the char index of byte at position `n` in a UTF-8 String
	#
	# `char_from` and `byte_from` are cached values to seek from.
	#
	# NOTE: char_from and byte_from are not guaranteed to be valid cache values
	# It is up to the client to ensure the validity of the information
	fun byte_to_char_index_cached(n, char_from, byte_from: Int): Int do
		var ns_i = byte_from
		var my_i = char_from

		# Seek forward, one char at a time
		while ns_i < n do
			ns_i += length_of_char_at(ns_i)
			my_i += 1
		end

		# Seek backward, one char at a time
		while ns_i > n do
			ns_i = find_beginning_of_char_at(ns_i - 1)
			my_i -= 1
		end

		return my_i
	end

	# Returns the beginning position of the char at position `pos`
	#
	# If the char is invalid UTF-8, `pos` is returned as-is
	#
	# ~~~raw
	# assert "abc".items.find_beginning_of_char_at(2) == 2
	# assert "か".items.find_beginning_of_char_at(1) == 0
	# assert [0x41u8, 233u8].to_s.items.find_beginning_of_char_at(1) == 1
	# ~~~
	#
	# The backward scan never goes further than 3 bytes before `pos`
	# and never before the first byte of `self`.
	fun find_beginning_of_char_at(pos: Int): Int do
		var endpos = pos
		var c = self[pos]
		if c & 0x80u8 == 0x00u8 then return pos
		# A lead byte is followed by at most 3 continuation bytes, so a
		# longer run of continuation bytes cannot belong to a valid char:
		# there is no need to read further back, nor before index 0.
		var min_pos = endpos - 3
		if min_pos < 0 then min_pos = 0
		while c & 0xC0u8 == 0x80u8 and pos > min_pos do
			pos -= 1
			c = self[pos]
		end
		var stpos = pos
		# The candidate start is kept only if its sequence covers `endpos`
		if length_of_char_at(stpos) >= (endpos - stpos + 1) then return pos
		return endpos
	end

	# Number of UTF-8 characters in `self` starting at `from`, for a length of `byte_length`
	fun utf8_length(from, byte_length: Int): Int is intern do
		var st = from
		var ln = 0
		while byte_length > 0 do
			# Fast path: count 4 chars at once while the next 4 bytes are all ASCII
			while byte_length >= 4 do
				var i = fetch_4_chars(st)
				if i & 0x80808080u32 != 0u32 then break
				byte_length -= 4
				st += 4
				ln += 4
			end
			if byte_length == 0 then break
			var cln = length_of_char_at(st)
			st += cln
			ln += 1
			byte_length -= cln
		end
		return ln
	end

	# Fetch the 4 bytes of `self` starting at `pos` as a single `UInt32`
	#
	# The bytes are read as-is, in the byte order of the host.
	fun fetch_4_chars(pos: Int): UInt32 is intern `{ return *((uint32_t*)(self+pos)); `}

	# Fetch the 4 bytes of `self` starting at `pos` as a single `UInt32`
	#
	# The bytes are interpreted as a big-endian value and converted to the
	# host byte order: the byte at `pos` is the most significant one.
	fun fetch_4_hchars(pos: Int): UInt32 is intern `{ return (uint32_t)be32toh(*((uint32_t*)(self+pos))); `}

	# Right shifts `len` bytes of `self` from `sh` bytes starting at position `pos`
	fun rshift(sh, len, pos: Int) do
		copy_to(self, len, pos, pos + sh)
	end

	# Left shifts `len` bytes of `self` from `sh` bytes starting at position `pos`
	fun lshift(sh, len, pos: Int) do
		copy_to(self, len, pos, pos - sh)
	end
end