0ff3787aff641e6dfe680680419cc894a4e6927b
[nit.git] / lib / core / text / native.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # Native structures for text and bytes
12 module native
13
14 import kernel
15 import math
16
in "C" `{
/*
 * Byte-order helpers for the C bodies of this module.
 * Whatever the platform, `be32toh(x)` must be defined once this block has
 * been processed: it converts a big-endian 32-bit value to host order.
 */
#ifdef __linux__
	#include <endian.h>
#endif
#ifdef __APPLE__
	#include <libkern/OSByteOrder.h>
	#define be32toh(x) OSSwapBigToHostInt32(x)
#endif
#ifdef _WIN32
	#define be32toh(val) _byteswap_ulong(val)
#endif

#ifdef __pnacl__
	/*
	 * Hand-written unconditional byte swaps.
	 * Each operand is masked before shifting so that bits outside of the
	 * swapped bytes never leak into the result; without the masks,
	 * be32toh(0xAABBCCDD) evaluated to 0xDDEEFFAA instead of 0xDDCCBBAA.
	 */
	#define be16toh(val) ((((val) >> 8) & 0xFFu) | (((val) & 0xFFu) << 8))
	#define be32toh(val) ((be16toh((val) & 0xFFFFu) << 16) | be16toh(((val) >> 16) & 0xFFFFu))
#endif
#ifndef be32toh
	/* Last resort: some systems name the conversion `betoh32`. */
	#define be32toh(val) betoh32(val)
#endif
`}
37
redef class Byte
	# Byte length of the UTF-8 sequence whose leading byte is `self`
	#
	# Only the high bits of `self` are inspected.
	# Any byte that is not the lead of a 2, 3 or 4-byte sequence
	# (ASCII bytes, continuation bytes, other patterns) yields 1.
	fun u8len: Int do
		# ASCII bytes have a clear high bit, hence match none of the
		# multi-byte patterns and fall through to the final `return 1`.
		if self & 0b1110_0000u8 == 0b1100_0000u8 then return 2
		if self & 0b1111_0000u8 == 0b1110_0000u8 then return 3
		if self & 0b1111_1000u8 == 0b1111_0000u8 then return 4
		return 1
	end

	# Can `self` be the first byte of a UTF-8 sequence ?
	#
	# True for ASCII bytes and for the leading bytes of 2, 3 and 4-byte
	# sequences, false for anything else (e.g. continuation bytes).
	#
	# ~~~nit
	# assert 0u8.is_valid_utf8_start
	# assert 0xC0u8.is_valid_utf8_start
	# assert 0xE0u8.is_valid_utf8_start
	# assert 0xF0u8.is_valid_utf8_start
	# ~~~
	fun is_valid_utf8_start: Bool do
		var is_ascii = self & 0x80u8 == 0u8
		var is_lead2 = self & 0b1110_0000u8 == 0b1100_0000u8
		var is_lead3 = self & 0b1111_0000u8 == 0b1110_0000u8
		var is_lead4 = self & 0b1111_1000u8 == 0b1111_0000u8
		return is_ascii or is_lead2 or is_lead3 or is_lead4
	end
end
70
redef class Int
	# Decodes the UTF-16 surrogate pair packed in `self` into a code point
	#
	# The high surrogate is read from bits 16 to 31 of `self`,
	# the low surrogate from bits 0 to 15.
	#
	#     assert 0xD83DDE02.from_utf16_surr == 0x1F602
	fun from_utf16_surr: Int do
		var high = (self & 0xFFFF0000) >> 16
		var low = self & 0xFFFF
		return ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
	end
end
85
# C string `char *`
#
# Used as underlying implementation for `String` and some other `Text`.
#
# All positions taken by the services of this class are byte offsets from
# the start of `self`; no bound checking is done by the code of this class.
extern class CString `{ char* `}
	# Create a new `CString` with the capacity for `length` characters
	new(length: Int) is intern

	# Get a char* starting at `index`.
	#
	# WARNING: Unsafe for extern code, use only for temporary
	# pointer manipulation purposes (e.g. write to file or such)
	fun fast_cstring(index: Int): CString is intern

	# Get char at `index`.
	fun [](index: Int): Byte is intern

	# Set char `item` at index.
	fun []=(index: Int, item: Byte) is intern

	# Copy `length` bytes of `self` to `dest`.
	#
	# As used by `rshift` and `lshift`, `from` is the start offset in `self`
	# and `to` is the start offset in `dest`.
	fun copy_to(dest: CString, length: Int, from: Int, to: Int) is intern

	redef fun ==(o) is intern do return is_same_instance(o)

	redef fun !=(o) is intern do return not is_same_instance(o)

	# Position of the first nul character.
	fun cstring_length: Int
	do
		var l = 0
		while self[l] != 0u8 do l += 1
		return l
	end

	# Parse `self` as an Int.
	fun atoi: Int is intern

	# Parse `self` as a Float.
	fun atof: Float `{ return atof(self); `}

	# Gets the UTF-8 char starting at byte index `pos`
	#
	# `pos` is expressed in bytes, not in Unicode chars
	#
	# ~~~raw
	#     assert "かきく".as(FlatString).items.char_at(0) == 'か'
	# ~~~
	#
	# If the bytes at position `pos` do not form a well-formed sequence
	# (e.g. `pos` is in the middle of a char), the Unicode replacement
	# character � (0xFFFD) is returned.
	#
	# ~~~raw
	#     assert "かきく".as(FlatString).items.char_at(1) == '�'
	# ~~~
	#
	# NOTE: for a non-ASCII byte at `pos`, the 4 bytes starting at `pos`
	# are read, whatever the actual length of the sequence.
	fun char_at(pos: Int): Char do
		var c = self[pos]
		if c & 0x80u8 == 0u8 then return c.ascii
		# `b` holds the bytes pos..pos+3, first byte in bits 24..31
		var b = fetch_4_hchars(pos)
		var ret = 0
		# Every multi-byte sequence needs a continuation as second byte
		if b & 0xC00000 != 0x800000 then return 0xFFFD.code_point
		if b & 0xE0000000 == 0xC0000000 then
			# 2 bytes: 110xxxxx 10xxxxxx
			ret |= (b & 0x1F000000) >> 18
			ret |= (b & 0x3F0000) >> 16
			return ret.code_point
		end
		# From here, the third byte must be a continuation too
		if b & 0xC000 != 0x8000 then return 0xFFFD.code_point
		if b & 0xF0000000 == 0xE0000000 then
			# 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
			ret |= (b & 0xF000000) >> 12
			ret |= (b & 0x3F0000) >> 10
			ret |= (b & 0x3F00) >> 8
			return ret.code_point
		end
		# From here, the fourth byte must be a continuation too
		if b & 0xC0 != 0x80 then return 0xFFFD.code_point
		if b & 0xF8000000 == 0xF0000000 then
			# 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			ret |= (b & 0x7000000) >> 6
			ret |= (b & 0x3F0000) >> 4
			ret |= (b & 0x3F00) >> 2
			ret |= b & 0x3F
			return ret.code_point
		end
		# The first byte is not a valid lead byte
		return 0xFFFD.code_point
	end

	# Gets the byte index of char at position `n` in UTF-8 String
	fun char_to_byte_index(n: Int): Int do return char_to_byte_index_cached(n, 0, 0)

	# Gets the length of the character at position `pos` (1 if invalid sequence)
	#
	# `pos` is a byte index.
	# The bytes following `pos` are only read as far as needed to check
	# that they are continuation bytes.
	fun length_of_char_at(pos: Int): Int do
		var c = self[pos]
		if c & 0x80u8 == 0x00u8 then
			return 1
		else if c & 0xE0u8 == 0xC0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 then
			return 2
		else if c & 0xF0u8 == 0xE0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 then
			return 3
		else if c & 0xF8u8 == 0xF0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 and self[pos + 3] & 0xC0u8 == 0x80u8 then
			return 4
		else
			return 1
		end
	end

	# Gets the byte index of char at position `n` in UTF-8 String
	#
	# `char_from` and `byte_from` are cached values to seek from.
	#
	# NOTE: char_from and byte_from are not guaranteed to be valid cache values
	# It is up to the client to ensure the validity of the information
	fun char_to_byte_index_cached(n, char_from, byte_from: Int): Int do
		var ns_i = byte_from
		var my_i = char_from

		# Number of chars left to walk: positive is forward, negative is backward
		var dist = n - my_i

		while dist > 0 do
			# Fast path: 4 bytes without any high bit set are 4 ASCII chars
			while dist >= 4 do
				var i = fetch_4_chars(ns_i)
				if i & 0x80808080 != 0 then break
				ns_i += 4
				my_i += 4
				dist -= 4
			end
			if dist == 0 then break
			ns_i += length_of_char_at(ns_i)
			my_i += 1
			dist -= 1
		end

		while dist < 0 do
			# Same fast path on the 4 bytes preceding the current position
			while dist <= -4 do
				var i = fetch_4_chars(ns_i - 4)
				if i & 0x80808080 != 0 then break
				ns_i -= 4
				my_i -= 4
				dist += 4
			end
			if dist == 0 then break
			ns_i = find_beginning_of_char_at(ns_i - 1)
			my_i -= 1
			dist += 1
		end

		return ns_i
	end

	# Gets the char index of byte at position `n` in a UTF-8 String
	#
	# `char_from` and `byte_from` are cached values to seek from.
	#
	# NOTE: char_from and byte_from are not guaranteed to be valid cache values
	# It is up to the client to ensure the validity of the information
	fun byte_to_char_index_cached(n, char_from, byte_from: Int): Int do
		var ns_i = byte_from
		var my_i = char_from

		# Walk forward, one char at a time, until `n` is reached or passed
		while ns_i < n do
			ns_i += length_of_char_at(ns_i)
			my_i += 1
		end

		# Walk backward, one char at a time, while `n` is before the position
		while ns_i > n do
			ns_i = find_beginning_of_char_at(ns_i - 1)
			my_i -= 1
		end

		return my_i
	end

	# Returns the beginning position of the char at position `pos`
	#
	# If the char is invalid UTF-8, `pos` is returned as-is
	#
	# The backward search for the leading byte never reads before index 0
	# nor more than 3 bytes before `pos`.
	#
	# ~~~raw
	#     assert "abc".items.find_beginning_of_char_at(2) == 2
	#     assert "か".items.find_beginning_of_char_at(1) == 0
	#     assert [0x41u8, 233u8].to_s.items.find_beginning_of_char_at(1) == 1
	# ~~~
	fun find_beginning_of_char_at(pos: Int): Int do
		var endpos = pos
		var c = self[pos]
		if c & 0x80u8 == 0x00u8 then return pos
		# A sequence is at most 4 bytes long, so its leading byte is at most
		# 3 bytes before `endpos`: walking further back cannot yield a valid
		# char, and walking below 0 would read outside of `self`.
		while c & 0xC0u8 == 0x80u8 and pos > 0 and endpos - pos < 3 do
			pos -= 1
			c = self[pos]
		end
		var stpos = pos
		# Accept `stpos` only if the char starting there covers `endpos`
		if length_of_char_at(stpos) >= (endpos - stpos + 1) then return pos
		return endpos
	end

	# Number of UTF-8 characters in `self` starting at `from`, for a length of `byte_length`
	fun utf8_length(from, byte_length: Int): Int is intern do
		var st = from
		var ln = 0
		while byte_length > 0 do
			# Fast path: 4 bytes without any high bit set are 4 ASCII chars
			while byte_length >= 4 do
				var i = fetch_4_chars(st)
				if i & 0x80808080 != 0 then break
				byte_length -= 4
				st += 4
				ln += 4
			end
			if byte_length == 0 then break
			var cln = length_of_char_at(st)
			st += cln
			ln += 1
			byte_length -= cln
		end
		return ln
	end

	# Fetch 4 chars in `self` at `pos`
	#
	# The 4 bytes are read as a single 32-bit value in host byte order.
	fun fetch_4_chars(pos: Int): Int is intern `{ return (long)*((uint32_t*)(self+pos)); `}

	# Fetch 4 chars in `self` at `pos`, with the first char in the highest bits
	#
	# The 4 bytes are read as a big-endian 32-bit value and converted to
	# host byte order, so the byte at `pos` ends up in bits 24 to 31.
	fun fetch_4_hchars(pos: Int): Int is intern `{ return (long)be32toh(*((uint32_t*)(self+pos))); `}


	# Right shifts `len` bytes of `self` from `sh` bytes starting at position `pos`
	fun rshift(sh, len, pos: Int) do
		copy_to(self, len, pos, pos + sh)
	end

	# Left shifts `len` bytes of `self` from `sh` bytes starting at position `pos`
	fun lshift(sh, len, pos: Int) do
		copy_to(self, len, pos, pos - sh)
	end
end