lib/core: Improve speed of an indexed access in a UTF-8 `Text` entity
[nit.git] / lib / core / text / native.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # Native structures for text and bytes
12 module native
13
14 import kernel
15 import math
16
in "C" `{
#ifdef __linux__
	#include <endian.h>
#endif
#ifdef __APPLE__
	#include <libkern/OSByteOrder.h>
	#define be32toh(x) OSSwapBigToHostInt32(x)
#endif

#ifdef __pnacl__
	/* Hand-written conversions for PNaCl, which is little-endian:
	 * converting from big-endian to host order is a plain byte swap.
	 * Each byte is masked before being moved so that no bit of a
	 * neighbouring byte leaks into the result. */
	#define be16toh(val) ((((val) >> 8) & 0xFFu) | (((val) & 0xFFu) << 8))
	#define be32toh(val) ( \
		(((uint32_t)(val) & 0x000000FFu) << 24) | \
		(((uint32_t)(val) & 0x0000FF00u) <<  8) | \
		(((uint32_t)(val) & 0x00FF0000u) >>  8) | \
		(((uint32_t)(val) & 0xFF000000u) >> 24))
#endif
#ifndef be32toh
	/* Last resort: fall back on the `betoh32` spelling of the conversion. */
	#define be32toh(val) betoh32(val)
#endif
`}
34
redef class Byte
	# Number of bytes of the UTF-8 sequence announced by `self`
	#
	# `self` is read as the leading byte of a UTF-8 sequence and only its
	# high bits are inspected:
	#
	# * `0xxxxxxx` is a 1-byte sequence
	# * `110xxxxx` is a 2-byte sequence
	# * `1110xxxx` is a 3-byte sequence
	# * `11110xxx` is a 4-byte sequence
	#
	# Any other byte (for instance a continuation byte `10xxxxxx`) is not
	# a leading byte and is counted as a single byte.
	fun u8len: Int do
		if self & 0x80u8 == 0u8 then return 1
		if self & 0xE0u8 == 0xC0u8 then return 2
		if self & 0xF0u8 == 0xE0u8 then return 3
		if self & 0xF8u8 == 0xF0u8 then return 4
		# Not a leading byte
		return 1
	end
end
51
redef class Int
	# Code point encoded by the UTF-16 surrogate pair packed in `self`
	#
	# The high surrogate is read from bits 16 to 31 of `self` and the low
	# surrogate from bits 0 to 15. No check is done on the surrogates:
	# values outside of the surrogate ranges produce a meaningless result.
	#
	#     assert 0xD83DDE02.from_utf16_surr == 0x1F602
	fun from_utf16_surr: Int do
		var high = (self & 0xFFFF0000) >> 16
		var low = self & 0xFFFF
		# 10 bits come from each surrogate, on top of the 0x10000 offset
		return ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
	end
end
66
67 # Native strings are simple C char *
68 extern class NativeString `{ char* `}
# Creates a new NativeString with a capacity of `length`
#
# NOTE(review): the allocation is done natively (`is intern`); whether
# the buffer is zero-filled or whether room for a trailing nul has to
# be counted in `length` is not visible here - confirm in the engines.
new(length: Int) is intern
71
# Returns a `char*` starting at `index`.
#
# `index` is presumably an offset in bytes from the beginning of `self`
# (implemented natively, to be confirmed in the engines).
#
# WARNING: Unsafe for extern code, use only for temporary
# pointer manipulation purposes (e.g. write to a file or such)
fun fast_cstring(index: Int): NativeString is intern
77
# Gets the byte at `index`.
#
# The result is a raw `Byte`, not a decoded Unicode character: use
# `char_at` to decode a UTF-8 sequence.
fun [](index: Int): Byte is intern
80
# Sets the byte at `index` to `item`.
fun []=(index: Int, item: Byte) is intern
83
# Copies bytes of `self` to `dest`.
#
# NOTE(review): implemented natively (`is intern`). Presumably `length`
# bytes are copied from offset `from` of `self` to offset `to` of
# `dest` - confirm the role of each parameter in the engines.
fun copy_to(dest: NativeString, length: Int, from: Int, to: Int) is intern
86
# Position of the first nul byte of `self`
#
# This is the length in bytes of `self` seen as a C string. The bytes
# are scanned from position 0 without any upper bound, so `self` has
# to contain a nul byte.
fun cstring_length: Int
do
	var len = 0
	loop
		if self[len] == 0u8 then return len
		len += 1
	end
end
94
# Parses `self` as an Int.
#
# NOTE(review): implemented natively (`is intern`); the accepted syntax
# and the result on malformed input are not visible here.
fun atoi: Int is intern
97
# Parses `self` as a Float.
#
# The conversion is delegated to the `atof` function of the C library,
# hence `self` is read as a nul-terminated C string.
fun atof: Float `{ return atof(self); `}
100
# Decodes the UTF-8 character whose first byte is at position `pos`
#
# `pos` is a position in bytes: it is used as-is to read the bytes of
# `self`, it is not a number of Unicode characters.
#
# ~~~raw
#     assert "かきく".as(FlatString).items.char_at(0) == 'か'
# ~~~
#
# If the bytes at `pos` do not start with a valid leading byte followed
# by the expected continuation bytes, the Unicode replacement
# character � (0xFFFD) is returned.
#
# ~~~raw
#     assert "かきく".as(FlatString).items.char_at(1) == '�'
# ~~~
#
# Only the structure of the sequence is checked: the decoded value
# itself is not validated (e.g. overlong forms are decoded as-is).
#
# NOTE: unless the byte at `pos` is ASCII, 4 bytes are fetched from
# `pos`, whatever the actual length of the sequence.
fun char_at(pos: Int): Char do
	var lead = self[pos]
	# 0xxxxxxx: single ASCII byte
	if lead & 0x80u8 == 0u8 then return lead.ascii

	# The 4 bytes from `pos`, the first one in the most significant position
	var word = fetch_4_hchars(pos)
	var replacement = 0xFFFD.code_point

	# Every multi-byte sequence needs a continuation as second byte
	if word & 0xC00000 != 0x800000 then return replacement
	# 110xxxxx 10xxxxxx
	if word & 0xE0000000 == 0xC0000000 then
		return (((word & 0x1F000000) >> 18) | ((word & 0x3F0000) >> 16)).code_point
	end

	# Longer sequences need a continuation as third byte
	if word & 0xC000 != 0x8000 then return replacement
	# 1110xxxx 10xxxxxx 10xxxxxx
	if word & 0xF0000000 == 0xE0000000 then
		return (((word & 0xF000000) >> 12) | ((word & 0x3F0000) >> 10) | ((word & 0x3F00) >> 8)).code_point
	end

	# Only 4-byte sequences remain: they need a continuation as fourth byte
	if word & 0xC0 != 0x80 then return replacement
	# 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	if word & 0xF8000000 == 0xF0000000 then
		return (((word & 0x7000000) >> 6) | ((word & 0x3F0000) >> 4) | ((word & 0x3F00) >> 2) | (word & 0x3F)).code_point
	end

	# The leading byte matches no known pattern
	return replacement
end
143
# Byte position of the character of index `n` in a UTF-8 string
#
# The search is done from the beginning of `self`, i.e. from the
# character 0 located at the byte 0.
fun char_to_byte_index(n: Int): Int
do
	return char_to_byte_index_cached(n, 0, 0)
end
146
# Length in bytes of the UTF-8 character starting at position `pos`
#
# The length announced by the leading byte is returned only if all the
# following bytes of the sequence are continuation bytes (`10xxxxxx`).
# In any other case (bad leading byte, truncated sequence) the
# sequence is invalid and 1 is returned.
fun length_of_char_at(pos: Int): Int do
	var lead = self[pos]
	# 0xxxxxxx: plain ASCII
	if lead & 0x80u8 == 0x00u8 then return 1

	# Length announced by the leading byte
	var announced: Int
	if lead & 0xE0u8 == 0xC0u8 then
		announced = 2
	else if lead & 0xF0u8 == 0xE0u8 then
		announced = 3
	else if lead & 0xF8u8 == 0xF0u8 then
		announced = 4
	else
		return 1
	end

	# Check the continuation bytes in order, stop at the first bad one
	var offset = 1
	while offset < announced do
		if self[pos + offset] & 0xC0u8 != 0x80u8 then return 1
		offset += 1
	end
	return announced
end
162
# Byte position of the character of index `n` in a UTF-8 string
#
# The search starts from a known position: the character of index
# `char_from`, located at the byte `byte_from`. It goes forward or
# backward depending on whether `n` is after or before `char_from`.
#
# NOTE: `char_from` and `byte_from` are not checked, it is up to the
# client to provide a pair of values that is valid for `self`.
fun char_to_byte_index_cached(n, char_from, byte_from: Int): Int do
	var byte_pos = byte_from
	# Number of characters left to cross, negative when seeking backward
	var remaining = n - char_from

	# Forward seek
	while remaining > 0 do
		# Fast path: 4 bytes with no high bit set are 4 ASCII characters
		while remaining >= 4 do
			if fetch_4_chars(byte_pos) & 0x80808080 != 0 then break
			byte_pos += 4
			remaining -= 4
		end
		if remaining == 0 then break
		byte_pos += length_of_char_at(byte_pos)
		remaining -= 1
	end

	# Backward seek
	while remaining < 0 do
		# Fast path: same test on the 4 bytes located before `byte_pos`
		while remaining <= -4 do
			if fetch_4_chars(byte_pos - 4) & 0x80808080 != 0 then break
			byte_pos -= 4
			remaining += 4
		end
		if remaining == 0 then break
		byte_pos = find_beginning_of_char_at(byte_pos - 1)
		remaining += 1
	end

	return byte_pos
end
205
# Index of the character located at the byte `n` in a UTF-8 string
#
# The search starts from a known position: the character of index
# `char_from`, located at the byte `byte_from`.
#
# NOTE: `char_from` and `byte_from` are not checked, it is up to the
# client to provide a pair of values that is valid for `self`.
fun byte_to_char_index_cached(n, char_from, byte_from: Int): Int do
	var byte_pos = byte_from
	var char_pos = char_from

	# Forward, one character at a time (may stop after `n`)
	loop
		if byte_pos >= n then break
		byte_pos += length_of_char_at(byte_pos)
		char_pos += 1
	end

	# Backward, one character at a time
	loop
		if byte_pos <= n then break
		byte_pos = find_beginning_of_char_at(byte_pos - 1)
		char_pos -= 1
	end

	return char_pos
end
228
# Returns the position of the first byte of the char covering position `pos`
#
# The bytes are scanned backward from `pos` as long as they are
# continuation bytes (`10xxxxxx`). The byte where the scan stops is
# accepted as the beginning of the char only if `length_of_char_at`
# gives it a length that reaches `pos`.
#
# If the char is invalid UTF-8, `pos` is returned as-is
#
# The scan never goes below position 0: when `self` begins with
# continuation bytes, no byte is read before the first one.
#
# ~~~raw
#     assert "abc".items.find_beginning_of_char_at(2) == 2
#     assert "か".items.find_beginning_of_char_at(1) == 0
#     assert [0x41u8, 233u8].to_s.items.find_beginning_of_char_at(1) == 1
# ~~~
fun find_beginning_of_char_at(pos: Int): Int do
	var endpos = pos
	var c = self[pos]
	# `pos > 0` keeps the scan inside of the buffer on malformed input
	while pos > 0 and c & 0xC0u8 == 0x80u8 do
		pos -= 1
		c = self[pos]
	end
	# `pos` is now the candidate beginning of the char
	if length_of_char_at(pos) >= (endpos - pos + 1) then return pos
	return endpos
end
249
# Number of UTF-8 characters in `self` between the byte positions `from` and `to`
#
# `to` is inclusive: a character is counted as soon as its first byte
# is at a position lower or equal to `to`. The result is 0 if `from`
# is greater than `to`.
fun utf8_length(from, to: Int): Int do
	var count = 0
	var cursor = from
	loop
		if cursor > to then return count
		cursor += length_of_char_at(cursor)
		count += 1
	end
end
261
# Fetches the 4 bytes of `self` starting at the byte position `pos`
#
# The 4 bytes are returned packed in a single Int. The body is only a
# fallback that delegates to `fetch_4_ffi`, which reads them as a
# 32-bit word without any byte-order conversion; the intern
# implementation presumably does the same (to be confirmed in the engines).
fun fetch_4_chars(pos: Int): Int is intern do return fetch_4_ffi(pos)
264
# Fetches the 4 bytes of `self` starting at the byte position `pos`, first byte first
#
# The 4 bytes are returned packed in a single Int. The body is only a
# fallback that delegates to `fetch_4h_ffi`, which converts the 32-bit
# word from big-endian to the host byte order: the byte at `pos` ends
# up in the most significant position, whatever the host. The intern
# implementation presumably does the same (to be confirmed in the engines).
fun fetch_4_hchars(pos: Int): Int is intern do return fetch_4h_ffi(pos)
267
# FIXME: To remove when bootstrap supports PR #1898
#
# Fallback of `fetch_4_chars`: the 4 bytes of `self` at `pos` read as
# a 32-bit word in the byte order of the host.
#
# The bytes are copied one by one into the word instead of reading
# through a casted `uint32_t*`: `self + pos` has no alignment
# guarantee and the buffer is made of `char`.
private fun fetch_4_ffi(pos: Int): Int `{
	union { uint32_t word; unsigned char bytes[4]; } u;
	const unsigned char *src = (const unsigned char*)(self + pos);
	u.bytes[0] = src[0];
	u.bytes[1] = src[1];
	u.bytes[2] = src[2];
	u.bytes[3] = src[3];
	return (long)u.word;
`}
# Fallback of `fetch_4_hchars`: the 4 bytes of `self` at `pos` packed
# in an Int, the byte at `pos` in the most significant position.
#
# The word is assembled byte by byte: the result does not depend on
# the byte order of the host and no unaligned 32-bit read is done.
private fun fetch_4h_ffi(pos: Int): Int `{
	const unsigned char *src = (const unsigned char*)(self + pos);
	uint32_t word = ((uint32_t)src[0] << 24)
		| ((uint32_t)src[1] << 16)
		| ((uint32_t)src[2] << 8)
		| (uint32_t)src[3];
	return (long)word;
`}
271 end