Merge: Nitcatalog for nitpm
[nit.git] / lib / core / text / native.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # Native structures for text and bytes
12 module native
13
14 import kernel
15 import math
16 import fixed_ints
17
in "C" `{
	/* Make a `be32toh` macro (big-endian 32 bits to host order) available
	 * on every supported platform. */

	/* Linux: provided by <endian.h>. */
	#if defined(__linux__)
	#include <endian.h>
	#endif

	/* macOS: map to the libkern byte-order helper. */
	#if defined(__APPLE__)
	#include <libkern/OSByteOrder.h>
	#define be32toh(v) OSSwapBigToHostInt32(v)
	#endif

	/* Windows: map to the 32-bit byte-swap intrinsic. */
	#if defined(_WIN32)
	#define be32toh(v) _byteswap_ulong(v)
	#endif

	/* Anything else that still lacks `be32toh`: fall back on `betoh32`. */
	#if !defined(be32toh)
	#define be32toh(v) betoh32(v)
	#endif
`}
34
redef class Byte
	# Length in bytes of the UTF-8 sequence whose leading byte is `self`
	#
	# Only the bit pattern of `self` is inspected. A byte that matches no
	# multi-byte leading pattern (e.g. a continuation byte) yields 1.
	fun u8len: Int do
		# 0xxxxxxx: single byte
		if self & 0b1000_0000u8 == 0u8 then return 1
		# 110xxxxx: leading byte of a 2-byte sequence
		if self & 0b1110_0000u8 == 0b1100_0000u8 then return 2
		# 1110xxxx: leading byte of a 3-byte sequence
		if self & 0b1111_0000u8 == 0b1110_0000u8 then return 3
		# 11110xxx: leading byte of a 4-byte sequence
		if self & 0b1111_1000u8 == 0b1111_0000u8 then return 4
		# Anything else is counted as a single byte
		return 1
	end

	# Does `self` have the bit pattern of a UTF-8 leading byte ?
	#
	# True for single-byte values (`0xxxxxxx`) and for the leading bytes
	# of 2, 3 and 4-byte sequences; false for everything else.
	#
	# ~~~nit
	# assert 0u8.is_valid_utf8_start
	# assert 0xC0u8.is_valid_utf8_start
	# assert 0xE0u8.is_valid_utf8_start
	# assert 0xF0u8.is_valid_utf8_start
	# ~~~
	fun is_valid_utf8_start: Bool do
		var single = self & 0x80u8 == 0u8
		var lead2 = self & 0b1110_0000u8 == 0b1100_0000u8
		var lead3 = self & 0b1111_0000u8 == 0b1110_0000u8
		var lead4 = self & 0b1111_1000u8 == 0b1111_0000u8
		return single or lead2 or lead3 or lead4
	end
end
67
redef class UInt32
	# Code point encoded by the UTF-16 surrogate pair packed in `self`
	#
	# The high surrogate is read from the upper 16 bits of `self` and
	# the low surrogate from the lower 16 bits.
	#
	#     assert 0xD83DDE02u32.from_utf16_surr == 0x1F602u32
	fun from_utf16_surr: UInt32 do
		var lead = (self & 0xFFFF0000u32) >> 16
		var trail = self & 0xFFFFu32
		# The lead surrogate carries the upper 10 bits, the trail surrogate
		# the lower 10 bits; the result is offset by 0x10000.
		return ((lead - 0xD800u32) << 10) + (trail - 0xDC00u32) + 0x10000u32
	end

	# The character whose code point (unicode-wise) is `self`
	#
	#     assert 65u32.code_point == 'A'
	#     assert 10u32.code_point == '\n'
	#     assert 0x220Bu32.code_point == '∋'
	fun code_point: Char `{ return self; `}
end
89
# C string `char *`
#
# Used as underlying implementation for `String` and some other `Text`.
extern class CString `{ char* `}
	# Create a new `CString` with the capacity for `length` characters
	new(length: Int) is intern

	# Get a char* starting at `index`.
	#
	# WARNING: Unsafe for extern code, use only for temporary
	# pointer manipulation purposes (e.g. write to file or such)
	fun fast_cstring(index: Int): CString is intern

	# Get the byte at `index`.
	fun [](index: Int): Byte is intern

	# Set the byte at `index` to `item`.
	fun []=(index: Int, item: Byte) is intern

	# Copy `length` bytes of `self`, starting at `from`, into `dest` at position `to`.
	fun copy_to(dest: CString, length: Int, from: Int, to: Int) is intern

	redef fun ==(o) is intern do return is_same_instance(o)

	redef fun !=(o) is intern do return not is_same_instance(o)

	# Position of the first nul character.
	fun cstring_length: Int
	do
		var l = 0
		while self[l] != 0u8 do l += 1
		return l
	end

	# Parse `self` as an Int.
	fun atoi: Int is intern

	# Parse `self` as a Float.
	fun atof: Float `{ return atof(self); `}

	# Gets the UTF-8 char starting at byte `pos`
	#
	# `pos` is a byte index in `self`, not an index in Unicode chars.
	#
	# ~~~raw
	# assert "かきく".as(FlatString).items.char_at(0) == 'か'
	# ~~~
	#
	# If the bytes at position `pos` do not form a valid UTF-8 sequence
	# (e.g. `pos` is in the middle of a char), the Unicode replacement
	# character � (0xFFFD) is returned.
	#
	# ~~~raw
	# assert "かきく".as(FlatString).items.char_at(1) == '�'
	# ~~~
	fun char_at(pos: Int): Char do
		var c = self[pos]
		# Single byte (ASCII) char
		if c & 0x80u8 == 0u8 then return c.ascii
		# `b` holds the 4 bytes starting at `pos`, the byte at `pos` being
		# the most significant one.
		var b = fetch_4_hchars(pos)
		var ret = 0u32
		# The second byte must be a continuation byte (10xxxxxx)
		if b & 0xC00000u32 != 0x800000u32 then return 0xFFFD.code_point
		# 2-byte sequence: 110xxxxx 10xxxxxx
		if b & 0xE0000000u32 == 0xC0000000u32 then
			ret |= (b & 0x1F000000u32) >> 18
			ret |= (b & 0x3F0000u32) >> 16
			return ret.code_point
		end
		# The third byte must be a continuation byte
		if b & 0xC000u32 != 0x8000u32 then return 0xFFFD.code_point
		# 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
		if b & 0xF0000000u32 == 0xE0000000u32 then
			ret |= (b & 0xF000000u32) >> 12
			ret |= (b & 0x3F0000u32) >> 10
			ret |= (b & 0x3F00u32) >> 8
			return ret.code_point
		end
		# The fourth byte must be a continuation byte
		if b & 0xC0u32 != 0x80u32 then return 0xFFFD.code_point
		# 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		if b & 0xF8000000u32 == 0xF0000000u32 then
			ret |= (b & 0x7000000u32) >> 6
			ret |= (b & 0x3F0000u32) >> 4
			ret |= (b & 0x3F00u32) >> 2
			ret |= b & 0x3Fu32
			return ret.code_point
		end
		# The first byte is not a valid leading byte
		return 0xFFFD.code_point
	end

	# Gets the byte index of char at position `n` in UTF-8 String
	fun char_to_byte_index(n: Int): Int do return char_to_byte_index_cached(n, 0, 0)

	# Gets the length of the character at position `pos` (1 if invalid sequence)
	#
	# `pos` is a byte index. The leading byte and every expected
	# continuation byte are checked.
	fun length_of_char_at(pos: Int): Int do
		var c = self[pos]
		if c & 0x80u8 == 0x00u8 then
			return 1
		else if c & 0xE0u8 == 0xC0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 then
			return 2
		else if c & 0xF0u8 == 0xE0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 then
			return 3
		else if c & 0xF8u8 == 0xF0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 and self[pos + 3] & 0xC0u8 == 0x80u8 then
			return 4
		else
			return 1
		end
	end

	# Gets the byte index of char at position `n` in UTF-8 String
	#
	# `char_from` and `byte_from` are cached values to seek from.
	#
	# NOTE: char_from and byte_from are not guaranteed to be valid cache values
	# It is up to the client to ensure the validity of the information
	fun char_to_byte_index_cached(n, char_from, byte_from: Int): Int do
		var ns_i = byte_from
		var my_i = char_from

		var dist = n - my_i

		# Seek forward
		while dist > 0 do
			# Fast path: skip 4 bytes at once while they are all ASCII
			while dist >= 4 do
				var i = fetch_4_chars(ns_i)
				if i & 0x80808080u32 != 0u32 then break
				ns_i += 4
				my_i += 4
				dist -= 4
			end
			if dist == 0 then break
			ns_i += length_of_char_at(ns_i)
			my_i += 1
			dist -= 1
		end

		# Seek backward
		while dist < 0 do
			# Fast path: skip 4 bytes at once while they are all ASCII
			while dist <= -4 do
				var i = fetch_4_chars(ns_i - 4)
				if i & 0x80808080u32 != 0u32 then break
				ns_i -= 4
				my_i -= 4
				dist += 4
			end
			if dist == 0 then break
			ns_i = find_beginning_of_char_at(ns_i - 1)
			my_i -= 1
			dist += 1
		end

		return ns_i
	end

	# Gets the char index of byte at position `n` in a UTF-8 String
	#
	# `char_from` and `byte_from` are cached values to seek from.
	#
	# NOTE: char_from and byte_from are not guaranteed to be valid cache values
	# It is up to the client to ensure the validity of the information
	fun byte_to_char_index_cached(n, char_from, byte_from: Int): Int do
		var ns_i = byte_from
		var my_i = char_from

		while ns_i < n do
			ns_i += length_of_char_at(ns_i)
			my_i += 1
		end

		while ns_i > n do
			ns_i = find_beginning_of_char_at(ns_i - 1)
			my_i -= 1
		end

		return my_i
	end

	# Returns the beginning position of the char at position `pos`
	#
	# If the char is invalid UTF-8, `pos` is returned as-is
	#
	# The search never looks before index 0, nor more than 3 bytes
	# before `pos`.
	#
	# ~~~raw
	# assert "abc".items.find_beginning_of_char_at(2) == 2
	# assert "か".items.find_beginning_of_char_at(1) == 0
	# assert [0x41u8, 233u8].to_s.items.find_beginning_of_char_at(1) == 1
	# ~~~
	fun find_beginning_of_char_at(pos: Int): Int do
		var c = self[pos]
		if c & 0x80u8 == 0x00u8 then return pos
		var stpos = pos
		# Walk back over continuation bytes (10xxxxxx).
		# A UTF-8 sequence has at most 3 of them, so a leading byte found
		# further back could not cover `pos`; also never step before the
		# start of the buffer.
		while c & 0xC0u8 == 0x80u8 and stpos > 0 and pos - stpos < 3 do
			stpos -= 1
			c = self[stpos]
		end
		# `stpos` is the beginning only if the sequence that starts there
		# is long enough to include the byte at `pos`.
		if length_of_char_at(stpos) >= (pos - stpos + 1) then return stpos
		return pos
	end

	# Number of UTF-8 characters in `self` starting at `from`, for a length of `byte_length`
	fun utf8_length(from, byte_length: Int): Int is intern do
		var st = from
		var ln = 0
		while byte_length > 0 do
			# Fast path: count 4 chars at once while the 4 bytes are all ASCII
			while byte_length >= 4 do
				var i = fetch_4_chars(st)
				if i & 0x80808080u32 != 0u32 then break
				byte_length -= 4
				st += 4
				ln += 4
			end
			if byte_length == 0 then break
			var cln = length_of_char_at(st)
			st += cln
			ln += 1
			byte_length -= cln
		end
		return ln
	end

	# Fetch the 4 bytes of `self` starting at `pos`, read as a single 32 bits word
	#
	# The bytes are not reordered: the numeric value depends on the byte
	# order of the host.
	#
	# NOTE(review): the extern body reads through a `uint32_t*` at an
	# arbitrary byte offset, alignment is not checked.
	fun fetch_4_chars(pos: Int): UInt32 is intern `{ return *((uint32_t*)(self+pos)); `}

	# Fetch the 4 bytes of `self` starting at `pos`, as a big-endian 32 bits word
	#
	# The word is converted with `be32toh`, so the byte at `pos` is the
	# most significant byte of the result whatever the host byte order is.
	fun fetch_4_hchars(pos: Int): UInt32 is intern `{ return (uint32_t)be32toh(*((uint32_t*)(self+pos))); `}

	# Right shifts `len` bytes of `self` from `sh` bytes starting at position `pos`
	fun rshift(sh, len, pos: Int) do
		copy_to(self, len, pos, pos + sh)
	end

	# Left shifts `len` bytes of `self` from `sh` bytes starting at position `pos`
	fun lshift(sh, len, pos: Int) do
		copy_to(self, len, pos, pos - sh)
	end
end