lib: Perfized `to_hex` and have it work anywhere in a `Text`
[nit.git] / lib / core / text / native.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # Native structures for text and bytes
12 module native
13
14 import kernel
15 import math
16
redef class Byte
	# Number of bytes of the UTF-8 sequence introduced by `self`
	#
	# Only the leading bits of `self` are inspected, the following bytes
	# are not checked.
	# A byte that does not match any lead-byte pattern (for instance a
	# continuation byte) is counted as a sequence of length 1.
	fun u8len: Int do
		# 0xxxxxxx: single-byte (ASCII) char
		if self & 0x80u8 == 0x00u8 then return 1
		# 110xxxxx: lead byte of a 2-byte sequence
		if self & 0xE0u8 == 0xC0u8 then return 2
		# 1110xxxx: lead byte of a 3-byte sequence
		if self & 0xF0u8 == 0xE0u8 then return 3
		# 11110xxx: lead byte of a 4-byte sequence
		if self & 0xF8u8 == 0xF0u8 then return 4
		# Anything else is not a valid lead byte
		return 1
	end
end
33
redef class Int
	# Decodes the UTF-16 surrogate pair packed in `self` into a code point
	#
	# The high surrogate is read from bits 16 to 31 of `self` and the
	# low surrogate from bits 0 to 15.
	# No check is made that the two halves are actual surrogates.
	#
	#     assert 0xD83DDE02.from_utf16_surr == 0x1F602
	fun from_utf16_surr: Int do
		var high = (self & 0xFFFF0000) >> 16
		var low = self & 0xFFFF
		# Each surrogate carries 10 payload bits; the pair encodes the
		# code point minus 0x10000
		return ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
	end
end
48
# Native strings are simple C `char *`
#
# The methods written in Nit below index the buffer directly and do not
# check positions against any length: callers must pass valid byte positions.
extern class NativeString `{ char* `}
	# Creates a new NativeString with a capacity of `length`
	new(length: Int) is intern

	# Returns a char* starting at `index`.
	#
	# WARNING: Unsafe for extern code, use only for temporary
	# pointer manipulation purposes (e.g. write to file or such)
	fun fast_cstring(index: Int): NativeString is intern

	# Get the byte at `index`.
	fun [](index: Int): Byte is intern

	# Set the byte at `index` to `item`.
	fun []=(index: Int, item: Byte) is intern

	# Copy `self` to `dest`.
	#
	# NOTE(review): presumably copies `length` bytes, reading `self` from
	# position `from` and writing `dest` from position `to`; confirm against
	# the intern implementation.
	fun copy_to(dest: NativeString, length: Int, from: Int, to: Int) is intern

	# Position of the first nul character.
	#
	# The scan starts at index 0 and only stops on a nul byte, so `self`
	# must be nul-terminated.
	fun cstring_length: Int
	do
		var l = 0
		while self[l] != 0u8 do l += 1
		return l
	end

	# Parse `self` as an Int.
	fun atoi: Int is intern

	# Parse `self` as a Float (delegates to the C function `atof`).
	fun atof: Float `{ return atof(self); `}

	# Gets the UTF-8 char starting at byte index `pos`
	#
	# `pos` is expressed in bytes, not in Unicode chars.
	#
	# ~~~raw
	#     assert "かきく".as(FlatString).items.char_at(0) == 'か'
	# ~~~
	#
	# If the bytes at `pos` do not form a lead byte followed by the
	# expected continuation bytes (e.g. `pos` is in the middle of a char),
	# the Unicode replacement character � (0xFFFD) is returned.
	#
	# ~~~raw
	#     assert "かきく".as(FlatString).items.char_at(1) == '�'
	# ~~~
	fun char_at(pos: Int): Char `{
		char c = self[pos];
		if((c & 0x80) == 0x00) return (uint32_t)c;
		if(((c & 0xE0) == 0xC0) && ((self[pos + 1] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0x1F) << 6) + ((((uint32_t)self[pos + 1] & 0x3F)));
		if(((c & 0xF0) == 0xE0) && ((self[pos + 1] & 0xC0) == 0x80) && ((self[pos + 2] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0xF) << 12) + ((((uint32_t)self[pos + 1]) & 0x3F) << 6) + ((((uint32_t)self[pos + 2] & 0x3F)));
		if(((c & 0xF8) == 0xF0) && ((self[pos + 1] & 0xC0) == 0x80) && ((self[pos + 2] & 0xC0) == 0x80) && ((self[pos + 3] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0x7) << 18) + ((((uint32_t)self[pos + 1]) & 0x3F) << 12) + ((((uint32_t)self[pos + 2]) & 0x3F) << 6) + ((((uint32_t)self[pos + 3] & 0x3F)));
		return 0xFFFD;
	`}

	# Gets the byte index of char at position `n` in UTF-8 String
	#
	# The search starts from the beginning of `self` (char 0, byte 0).
	fun char_to_byte_index(n: Int): Int do return char_to_byte_index_cached(n, 0, 0)

	# Gets the length in bytes of the character starting at byte position `pos`
	#
	# Returns 1 if the bytes at `pos` are not a lead byte followed by the
	# expected number of continuation bytes (invalid sequence).
	fun length_of_char_at(pos: Int): Int do
		var c = self[pos]
		if c & 0x80u8 == 0x00u8 then
			return 1
		else if c & 0xE0u8 == 0xC0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 then
			return 2
		else if c & 0xF0u8 == 0xE0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 then
			return 3
		else if c & 0xF8u8 == 0xF0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 and self[pos + 3] & 0xC0u8 == 0x80u8 then
			return 4
		else
			return 1
		end
	end

	# Gets the byte index of char at position `n` in UTF-8 String
	#
	# `char_from` and `byte_from` are cached values to seek from:
	# `byte_from` is taken as the byte index of the char at position `char_from`.
	#
	# NOTE: char_from and byte_from are not guaranteed to be valid cache values
	# It is up to the client to ensure the validity of the information
	fun char_to_byte_index_cached(n, char_from, byte_from: Int): Int do
		var ns_i = byte_from
		var my_i = char_from

		# Target is after the cached position: walk forward char by char
		while my_i < n do
			ns_i += length_of_char_at(ns_i)
			my_i += 1
		end

		# Target is before the cached position: walk backward char by char
		while my_i > n do
			ns_i = find_beginning_of_char_at(ns_i - 1)
			my_i -= 1
		end

		return ns_i
	end

	# Gets the char index of byte at position `n` in a UTF-8 String
	#
	# `char_from` and `byte_from` are cached values to seek from:
	# `byte_from` is taken as the byte index of the char at position `char_from`.
	#
	# NOTE: char_from and byte_from are not guaranteed to be valid cache values
	# It is up to the client to ensure the validity of the information
	fun byte_to_char_index_cached(n, char_from, byte_from: Int): Int do
		var ns_i = byte_from
		var my_i = char_from

		# Target is after the cached position: walk forward char by char
		while ns_i < n do
			ns_i += length_of_char_at(ns_i)
			my_i += 1
		end

		# Target is before the cached position: walk backward char by char
		while ns_i > n do
			ns_i = find_beginning_of_char_at(ns_i - 1)
			my_i -= 1
		end

		return my_i
	end

	# Returns the beginning position of the char at position `pos`
	#
	# If the char is invalid UTF-8, `pos` is returned as-is
	#
	# The backward search never reads before index 0 and never steps back
	# more than 3 bytes, since a UTF-8 char has at most 3 continuation bytes.
	# A `pos` lower than or equal to 0 is returned as-is without reading `self`.
	#
	# ~~~raw
	#     assert "abc".items.find_beginning_of_char_at(2) == 2
	#     assert "か".items.find_beginning_of_char_at(1) == 0
	#     assert [0x41u8, 233u8].to_s.items.find_beginning_of_char_at(1) == 1
	# ~~~
	fun find_beginning_of_char_at(pos: Int): Int do
		# Nothing can precede the first byte; this also avoids indexing
		# before the start of the buffer
		if pos <= 0 then return pos
		var endpos = pos
		var c = self[pos]
		# Skip continuation bytes (10xxxxxx) backwards, within the bounds
		# stated above
		while pos > 0 and endpos - pos < 3 and c & 0xC0u8 == 0x80u8 do
			pos -= 1
			c = self[pos]
		end
		var stpos = pos
		# The candidate start is accepted only if the sequence it introduces
		# is long enough to cover the requested position
		if length_of_char_at(stpos) >= (endpos - stpos + 1) then return pos
		return endpos
	end

	# Number of UTF-8 characters in `self` between byte positions `from` and `to`
	#
	# `to` is inclusive: every char starting at a position lower than or
	# equal to `to` is counted. Returns 0 if `from > to`.
	fun utf8_length(from, to: Int): Int do
		var st = from
		var lst = to
		var ln = 0
		while st <= lst do
			st += length_of_char_at(st)
			ln += 1
		end
		return ln
	end
end