Added services to NativeString and Byte
[nit.git] / lib / core / text / native.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # Native structures for text and bytes
12 module native
13
14 import kernel
15 import math
16
in "C" `{
/* Make `be32toh(x)` available on every supported platform.
 * `be32toh` converts a 32-bit big-endian value to the byte order of the host. */
#ifdef __linux__
	#include <endian.h>
#endif
#ifdef __APPLE__
	#include <libkern/OSByteOrder.h>
	#define be32toh(x) OSSwapBigToHostInt32(x)
#endif

#ifdef __pnacl__
	/* No <endian.h> here: swap the bytes by hand (assumes a little-endian
	 * target, as the previous macros did).
	 * Each byte is masked before being combined. Without the masks, bits of
	 * the neighbouring bytes leak into the result as soon as the operand is
	 * wider than the swapped field (0xAABBCCDD used to give 0xDDEEFFAA). */
	#define be16toh(val) ((((val) >> 8) & 0xFFu) | (((val) & 0xFFu) << 8))
	#define be32toh(val) ((((val) >> 24) & 0x000000FFu) | (((val) >> 8) & 0x0000FF00u) | (((val) << 8) & 0x00FF0000u) | (((val) << 24) & 0xFF000000u))
#endif
#ifndef be32toh
	/* Some BSD systems spell the same conversion `betoh32`. */
	#define be32toh(val) betoh32(val)
#endif
`}
34
redef class Byte
	# Number of bytes of the UTF-8 sequence introduced by `self`
	#
	# Only `self` is inspected, the following bytes are not checked.
	# A byte that cannot start a multi-byte sequence counts for 1.
	#
	# ~~~nit
	# assert 0x41u8.u8len == 1
	# assert 0xC3u8.u8len == 2
	# assert 0xE3u8.u8len == 3
	# assert 0xF0u8.u8len == 4
	# assert 0x80u8.u8len == 1
	# ~~~
	fun u8len: Int do
		# The three multi-byte lead patterns are mutually exclusive,
		# everything else (ASCII or invalid lead) is one byte long.
		if self & 0xE0u8 == 0xC0u8 then return 2
		if self & 0xF0u8 == 0xE0u8 then return 3
		if self & 0xF8u8 == 0xF0u8 then return 4
		return 1
	end

	# Can `self` be the first byte of a UTF-8 sequence ?
	#
	# Only the bit pattern of the lead byte is tested:
	# `0xxxxxxx`, `110xxxxx`, `1110xxxx` or `11110xxx`.
	#
	# ~~~nit
	# assert 0u8.is_valid_utf8_start
	# assert 0xC0u8.is_valid_utf8_start
	# assert 0xE0u8.is_valid_utf8_start
	# assert 0xF0u8.is_valid_utf8_start
	# ~~~
	fun is_valid_utf8_start: Bool do
		var one_byte = self & 0b1000_0000u8 == 0u8
		var two_bytes = self & 0xE0u8 == 0xC0u8
		var three_bytes = self & 0xF0u8 == 0xE0u8
		var four_bytes = self & 0xF8u8 == 0xF0u8
		return one_byte or two_bytes or three_bytes or four_bytes
	end
end
67
redef class Int
	# Code point encoded by the UTF-16 surrogate pair packed in `self`
	#
	# The high surrogate is read from bits 16 to 31 and the low surrogate
	# from bits 0 to 15. Neither half is checked to be an actual surrogate.
	#
	#     assert 0xD83DDE02.from_utf16_surr == 0x1F602
	fun from_utf16_surr: Int do
		var high = (self & 0xFFFF0000) >> 16
		var low = self & 0xFFFF
		# Each surrogate carries 10 bits of the code point, offset by 0x10000
		return ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
	end
end
82
83 # Native strings are simple C char *
84 extern class NativeString `{ char* `}
85 # Creates a new NativeString with a capacity of `length`
86 new(length: Int) is intern
87
88 # Returns a char* starting at `index`.
89 #
90 # WARNING: Unsafe for extern code, use only for temporary
91 # pointer manipulation purposes (e.g. write to file or such)
92 fun fast_cstring(index: Int): NativeString is intern
93
94 # Get the byte at `index`.
95 fun [](index: Int): Byte is intern
96
97 # Set the byte at `index` to `item`.
98 fun []=(index: Int, item: Byte) is intern
99
100 # Copy `length` bytes of `self`, starting at `from`, into `dest` starting at `to`.
# `rshift` and `lshift` call it with `dest == self` on ranges that may overlap.
# NOTE(review): whether the intern implementation is overlap-safe is not visible here, confirm.
101 fun copy_to(dest: NativeString, length: Int, from: Int, to: Int) is intern
102
# Two native strings are equal only when they are the same instance, contents are not compared.
103 redef fun ==(o) is intern do return is_same_instance(o)
104
# Negation of `==`: true unless `o` is the same instance as `self`.
105 redef fun !=(o) is intern do return not is_same_instance(o)
106
# Number of bytes found before the first nul byte of `self`.
#
# The scan has no other stopping condition than meeting a nul byte.
fun cstring_length: Int
do
	var pos = 0
	loop
		if self[pos] == 0u8 then break
		pos += 1
	end
	return pos
end
114
115 # Parse `self` as an Int.
116 fun atoi: Int is intern
117
118 # Parse `self` as a Float.
#
# Delegates to the C function `atof`, which does not report errors:
# the C standard library returns 0.0 when no conversion can be done.
119 fun atof: Float `{ return atof(self); `}
120
# Gets the UTF-8 char starting at byte index `pos`
#
# `pos` is a byte offset in `self`, not a rank in Unicode characters.
#
# ~~~raw
# assert "かきく".as(FlatString).items.char_at(0) == 'か'
# assert "かきく".as(FlatString).items.char_at(3) == 'き'
# ~~~
#
# If the bytes at `pos` do not form a complete sequence (the byte at `pos`
# is not a lead byte, or a continuation byte is missing),
# the Unicode replacement character � (0xFFFD) is returned.
#
# ~~~raw
# assert "かきく".as(FlatString).items.char_at(1) == '�'
# ~~~
#
# Bytes are read one at a time and reading stops at the first byte that is
# not a continuation byte, so nothing is read beyond the terminating nul
# of a sequence truncated at the end of the string.
fun char_at(pos: Int): Char do
	var c = self[pos]
	if c & 0x80u8 == 0u8 then return c.ascii

	# Number of continuation bytes announced by the lead byte
	var trail = 0
	# Code point under construction, seeded with the payload of the lead byte
	var ret = 0
	if c & 0xE0u8 == 0xC0u8 then
		trail = 1
		ret = (c & 0x1Fu8).to_i
	else if c & 0xF0u8 == 0xE0u8 then
		trail = 2
		ret = (c & 0x0Fu8).to_i
	else if c & 0xF8u8 == 0xF0u8 then
		trail = 3
		ret = (c & 0x07u8).to_i
	else
		# Stray continuation byte or invalid lead byte
		return 0xFFFD.code_point
	end

	# Each continuation byte (`10xxxxxx`) brings 6 more bits
	var i = 1
	while i <= trail do
		var b = self[pos + i]
		if b & 0xC0u8 != 0x80u8 then return 0xFFFD.code_point
		ret = (ret << 6) | (b & 0x3Fu8).to_i
		i += 1
	end
	return ret.code_point
end
163
# Byte index of the char at position `n` in a UTF-8 string
#
# The search is done from the start of `self` (char 0 at byte 0).
fun char_to_byte_index(n: Int): Int
do
	return char_to_byte_index_cached(n, 0, 0)
end
166
# Length in bytes of the character starting at byte `pos`
#
# The lead byte announces the length, then each expected continuation
# byte is checked. 1 is returned for an ASCII byte, and also when the
# sequence is invalid (bad lead byte or missing continuation byte).
fun length_of_char_at(pos: Int): Int do
	var lead = self[pos]

	# Length announced by the lead byte, 1 for ASCII and for invalid leads
	var expected = 1
	if lead & 0xE0u8 == 0xC0u8 then
		expected = 2
	else if lead & 0xF0u8 == 0xE0u8 then
		expected = 3
	else if lead & 0xF8u8 == 0xF0u8 then
		expected = 4
	end

	# Stop at the first byte that is not a `10xxxxxx` continuation byte
	var off = 1
	while off < expected do
		if self[pos + off] & 0xC0u8 != 0x80u8 then return 1
		off += 1
	end
	return expected
end
182
# Gets the byte index of char at position `n` in UTF-8 String
#
# `char_from` and `byte_from` are cached values to seek from:
# the seek starts at byte `byte_from` and moves by `n - char_from` chars,
# forward or backward.
#
# NOTE: char_from and byte_from are not guaranteed to be valid cache values
# It is up to the client to ensure the validity of the information
fun char_to_byte_index_cached(n, char_from, byte_from: Int): Int do
	var ns_i = byte_from

	# Signed number of chars left to skip, negative when seeking backward
	var dist = n - char_from

	# Forward seek
	while dist > 0 do
		# Fast path: a word without any high bit set is 4 ASCII chars
		while dist >= 4 do
			var i = fetch_4_chars(ns_i)
			if i & 0x80808080 != 0 then break
			ns_i += 4
			dist -= 4
		end
		if dist == 0 then break
		# Slow path: skip a single, possibly multi-byte, char
		ns_i += length_of_char_at(ns_i)
		dist -= 1
	end

	# Backward seek, mirror of the loop above
	while dist < 0 do
		while dist <= -4 do
			var i = fetch_4_chars(ns_i - 4)
			if i & 0x80808080 != 0 then break
			ns_i -= 4
			dist += 4
		end
		if dist == 0 then break
		ns_i = find_beginning_of_char_at(ns_i - 1)
		dist += 1
	end

	return ns_i
end
225
# Gets the char index of byte at position `n` in a UTF-8 String
#
# `char_from` and `byte_from` are cached values to seek from.
#
# NOTE: char_from and byte_from are not guaranteed to be valid cache values
# It is up to the client to ensure the validity of the information
fun byte_to_char_index_cached(n, char_from, byte_from: Int): Int do
	var byte_pos = byte_from
	var char_pos = char_from

	# Step forward one char at a time until `n` is reached or passed
	loop
		if byte_pos >= n then break
		byte_pos += length_of_char_at(byte_pos)
		char_pos += 1
	end

	# Step backward one char at a time while beyond `n`
	loop
		if byte_pos <= n then break
		byte_pos = find_beginning_of_char_at(byte_pos - 1)
		char_pos -= 1
	end

	return char_pos
end
248
# Returns the beginning position of the char at position `pos`
#
# If the char is invalid UTF-8, `pos` is returned as-is
#
# The backward scan never goes before byte 0 and never goes back more
# than 3 bytes, since a UTF-8 sequence has at most 3 continuation bytes.
#
# ~~~raw
# assert "abc".items.find_beginning_of_char_at(2) == 2
# assert "か".items.find_beginning_of_char_at(1) == 0
# assert [0x41u8, 233u8].to_s.items.find_beginning_of_char_at(1) == 1
# ~~~
fun find_beginning_of_char_at(pos: Int): Int do
	var endpos = pos
	var c = self[pos]
	if c & 0x80u8 == 0x00u8 then return pos
	# Walk back over continuation bytes (`10xxxxxx`) to the candidate lead byte.
	# Going further than 3 bytes cannot change the answer: no sequence is
	# longer than 4 bytes, so the length check below would fail anyway.
	while c & 0xC0u8 == 0x80u8 and pos > 0 and endpos - pos < 3 do
		pos -= 1
		c = self[pos]
	end
	var stpos = pos
	# The sequence found must be long enough to cover `endpos`
	if length_of_char_at(stpos) >= (endpos - stpos + 1) then return pos
	return endpos
end
270
# Number of UTF-8 characters in `self` starting at `from`, for a length of `bytelen`
#
# `from` and `bytelen` are both expressed in bytes.
fun utf8_length(from, bytelen: Int): Int is intern do
	var cursor = from
	var chars = 0
	while bytelen > 0 do
		# A word without any high bit set holds 4 ASCII chars
		if bytelen >= 4 and fetch_4_chars(cursor) & 0x80808080 == 0 then
			cursor += 4
			chars += 4
			bytelen -= 4
			continue
		end
		# Otherwise count a single, possibly multi-byte, char
		var width = length_of_char_at(cursor)
		cursor += width
		bytelen -= width
		chars += 1
	end
	return chars
end
291
292 # Fetch the 4 bytes of `self` starting at byte `pos` as a single Int
#
# The bytes are loaded as one 32-bit unsigned word, without any byte-order
# conversion: which byte ends up in the low bits depends on the host.
# Callers in this class only test the word against `0x80808080`,
# which does not depend on the byte order.
293 fun fetch_4_chars(pos: Int): Int is intern `{ return (long)*((uint32_t*)(self+pos)); `}
294
295 # Fetch the 4 bytes of `self` starting at byte `pos`, read as a big-endian 32-bit word
#
# The loaded word goes through `be32toh`, so the byte at `pos` is
# in the most significant 8 bits of the 32-bit result and the byte at
# `pos + 3` in the least significant ones, whatever the host byte order.
296 fun fetch_4_hchars(pos: Int): Int is intern `{ return (long)be32toh(*((uint32_t*)(self+pos))); `}
297
298
# Moves the `len` bytes of `self` starting at position `pos`, `sh` bytes to the right
#
# The move is done in place with `copy_to`, from `pos` to `pos + sh`.
fun rshift(sh, len, pos: Int) do
	var target = pos + sh
	copy_to(self, len, pos, target)
end
303
# Moves the `len` bytes of `self` starting at position `pos`, `sh` bytes to the left
#
# The move is done in place with `copy_to`, from `pos` to `pos - sh`.
fun lshift(sh, len, pos: Int) do
	var target = pos - sh
	copy_to(self, len, pos, target)
end
308 end