lib/core: Fixed bug in `BytesIterator::init with_buffer`
[nit.git] / lib / core / bytes.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Services for byte streams and arrays
16 module bytes
17
18 import kernel
19 import collection::array
20 intrude import text::flat
21
redef class Byte
	# Does `self` hold the ASCII code of a hexadecimal digit?
	#
	# Accepted ranges are `'0'..'9'`, `'A'..'F'` and `'a'..'f'`.
	#
	# ~~~nit
	# intrude import core::bytes
	# assert not '/'.ascii.to_b.is_valid_hexdigit
	# assert '0'.ascii.to_b.is_valid_hexdigit
	# assert '9'.ascii.to_b.is_valid_hexdigit
	# assert not ':'.ascii.to_b.is_valid_hexdigit
	# assert not '@'.ascii.to_b.is_valid_hexdigit
	# assert 'A'.ascii.to_b.is_valid_hexdigit
	# assert 'F'.ascii.to_b.is_valid_hexdigit
	# assert not 'G'.ascii.to_b.is_valid_hexdigit
	# assert not '`'.ascii.to_b.is_valid_hexdigit
	# assert 'a'.ascii.to_b.is_valid_hexdigit
	# assert 'f'.ascii.to_b.is_valid_hexdigit
	# assert not 'g'.ascii.to_b.is_valid_hexdigit
	# ~~~
	private fun is_valid_hexdigit: Bool do
		# '0' .. '9'
		if self >= 0x30u8 and self <= 0x39u8 then return true
		# 'A' .. 'F'
		if self >= 0x41u8 and self <= 0x46u8 then return true
		# 'a' .. 'f'
		return self >= 0x61u8 and self <= 0x66u8
	end

	# Numeric value (0 to 15) of the hexadecimal digit whose ASCII code is `self`
	#
	# ~~~nit
	# intrude import core::bytes
	# assert 0x39u8.hexdigit_to_byteval == 0x09u8
	# assert 0x43u8.hexdigit_to_byteval == 0x0Cu8
	# ~~~
	#
	# REQUIRE: `self.is_valid_hexdigit`
	private fun hexdigit_to_byteval: Byte do
		# Decimal digit: offset from '0'
		if self >= 0x30u8 and self <= 0x39u8 then return self - 0x30u8
		# Uppercase letter: 'A' (0x41) maps to 10
		if self >= 0x41u8 and self <= 0x46u8 then return self - 0x37u8
		# Lowercase letter: 'a' (0x61) maps to 10
		if self >= 0x61u8 and self <= 0x66u8 then return self - 0x57u8
		# Only reached when the requirement is violated;
		# the abort also satisfies the compiler's return analysis.
		abort
	end
end
68
# A buffer containing Byte-manipulation facilities
#
# The bytes live in a `NativeString` (`items`); `length` bytes are in use out of
# `capacity` allocated ones.
#
# Uses Copy-On-Write when persisted: once `to_s` has handed `items` to a `String`,
# the next mutation first moves the content to a fresh `NativeString` (see `regen`).
class Bytes
	super AbstractArray[Byte]

	# A NativeString being a char*, it can be used as underlying representation here.
	private var items: NativeString

	# Number of bytes in the array
	redef var length

	# Capacity of the array
	private var capacity: Int

	# Has this buffer been persisted (to_s'd)?
	#
	# Used for Copy-On-Write
	private var persisted = false

	#     var b = new Bytes.empty
	#     assert b.to_s == ""
	init empty do
		var ns = new NativeString(0)
		init(ns, 0, 0)
	end

	# Init a `Bytes` with capacity `cap`
	init with_capacity(cap: Int) do
		var ns = new NativeString(cap)
		init(ns, 0, cap)
	end

	# Is there no byte in `self`?
	#
	#     var b = new Bytes.empty
	#     assert b.is_empty
	#     b.add 101u8
	#     assert not b.is_empty
	redef fun is_empty do return length == 0

	#     var b = new Bytes.empty
	#     b.add 101u8
	#     assert b[0] == 101u8
	redef fun [](i) do
		assert i >= 0
		assert i < length
		return items[i]
	end

	# Sets the byte at index `i`; `i == length` appends `v`.
	#
	#     var b = new Bytes.with_capacity(1)
	#     b[0] = 101u8
	#     assert b.to_s == "e"
	redef fun []=(i, v) do
		if persisted then regen
		assert i >= 0
		assert i <= length
		if i == length then
			# `add` already stores `v` and grows the storage if needed
			add(v)
			return
		end
		items[i] = v
	end

	#     var b = new Bytes.empty
	#     b.add 101u8
	#     assert b.to_s == "e"
	redef fun add(c) do
		if persisted then regen
		# `enlarge(sz)` does nothing when `capacity >= sz`, so room for
		# one more byte than currently stored must be requested.
		if length >= capacity then enlarge(length + 1)
		items[length] = c
		length += 1
	end

	#     var b = new Bytes.empty
	#     b.append([104u8, 101u8, 108u8, 108u8, 111u8])
	#     assert b.to_s == "hello"
	redef fun append(arr) do
		if arr isa Bytes then
			# Bulk copy straight from the other buffer's storage
			append_ns(arr.items, arr.length)
		else
			for i in arr do add i
		end
	end

	#     var b = new Bytes.empty
	#     b.append([0x41u8, 0x41u8, 0x18u8])
	#     b.pop
	#     assert b.to_s == "AA"
	redef fun pop do
		assert length >= 1
		length -= 1
		return items[length]
	end

	redef fun clear do length = 0

	# Regenerates the buffer, necessary when it was persisted
	#
	# Copies the `length` used bytes to a newly allocated `NativeString` of the
	# same `capacity` and makes it the new storage, so that later writes do not
	# touch the storage handed out by `to_s`.
	private fun regen do
		var nns = new NativeString(capacity)
		items.copy_to(nns, length, 0, 0)
		items = nns
		persisted = false
	end

	# Appends the `ln` first bytes of `ns` to self
	fun append_ns(ns: NativeString, ln: Int) do
		if persisted then regen
		var nlen = length + ln
		if nlen > capacity then enlarge(nlen)
		ns.copy_to(items, ln, 0, length)
		length += ln
	end

	# Appends `ln` bytes from `ns` starting at index `from` to self
	fun append_ns_from(ns: NativeString, ln, from: Int) do
		if persisted then regen
		var nlen = length + ln
		if nlen > capacity then enlarge(nlen)
		ns.copy_to(items, ln, from, length)
		length += ln
	end

	# Grows the storage so that `capacity >= sz`; does nothing if it already is.
	redef fun enlarge(sz) do
		if capacity >= sz then return
		# The content moves to a fresh allocation below,
		# hence the buffer is no longer shared with any String.
		persisted = false
		while capacity < sz do capacity = capacity * 2 + 2
		var ns = new NativeString(capacity)
		items.copy_to(ns, length, 0, 0)
		items = ns
	end

	# Builds a `FlatString` over the bytes of `self`
	#
	# When `is_utf8` holds, the string is built directly on `items` and `self` is
	# flagged as persisted. Otherwise the string is built on the copy returned by
	# `clean_utf8` and `self` is not flagged.
	redef fun to_s do
		persisted = true
		var b = self
		if not is_utf8 then
			b = clean_utf8
			persisted = false
		end
		return new FlatString.with_infos(b.items, b.length, 0, b.length -1)
	end

	redef fun iterator do return new BytesIterator.with_buffer(self)

	# Is the byte collection valid UTF-8 ?
	#
	# NOTE(review): a multi-byte sequence truncated at the end of the buffer makes
	# `char_at` look at bytes located after `length` - confirm whether an explicit
	# bound check is wanted here.
	fun is_utf8: Bool do
		# (mask, expected value) pairs for the leading byte of 1 to 4-byte sequences
		var charst = once [0x80u8, 0u8, 0xE0u8, 0xC0u8, 0xF0u8, 0xE0u8, 0xF8u8, 0xF0u8]
		# Smallest and largest code point allowed for each sequence length
		var lobounds = once [0, 0x80, 0x800, 0x10000]
		var hibounds = once [0x7F, 0x7FF, 0xFFFF, 0x10FFFF]
		var pos = 0
		var len = length
		var mits = items
		while pos < len do
			var nxst = mits.length_of_char_at(pos)
			var charst_index = (nxst - 1) * 2
			# Leading byte does not match the pattern for its announced length
			if (mits[pos] & charst[charst_index]) != charst[charst_index + 1] then return false
			var cp = mits.char_at(pos).ascii
			# Code point outside of the range allowed for a sequence of `nxst` bytes
			if cp > hibounds[nxst - 1] or cp < lobounds[nxst - 1] then return false
			# 0xD800..0xDFFF, 0xFFFE and 0xFFFF are rejected
			if (cp >= 0xD800 and cp <= 0xDFFF) or cp == 0xFFFE or cp == 0xFFFF then return false
			pos += nxst
		end
		return true
	end

	# Cleans the bytes of `self` to be UTF-8 compliant
	#
	# Returns a new `Bytes`; `self` is left untouched. Each position rejected by
	# the same checks as `is_utf8` is replaced by the bytes `0xEF 0xBF 0xBD` and
	# the scan resumes one byte further.
	#
	# NOTE(review): as in `is_utf8`, a sequence truncated at the end of the buffer
	# is read past `length` - confirm whether a bound check is wanted.
	private fun clean_utf8: Bytes do
		# (mask, expected value) pairs for the leading byte of 1 to 4-byte sequences
		var charst = once [0x80u8, 0u8, 0xE0u8, 0xC0u8, 0xF0u8, 0xE0u8, 0xF8u8, 0xF0u8]
		# Replacement bytes emitted for each rejected position
		var badchar = once [0xEFu8, 0xBFu8, 0xBDu8]
		var lobounds = once [0, 0x80, 0x800, 0x10000]
		var hibounds = once [0x7F, 0x7FF, 0xFFFF, 0x10FFFF]
		var pos = 0
		var len = length
		var ret = new Bytes.with_capacity(len)
		var mits = items
		while pos < len do
			var nxst = mits.length_of_char_at(pos)
			var charst_index = (nxst - 1) * 2
			var well_formed = false
			if (mits[pos] & charst[charst_index]) == charst[charst_index + 1] then
				var cp = mits.char_at(pos).ascii
				if cp <= hibounds[nxst - 1] and cp >= lobounds[nxst - 1] then
					var forbidden = (cp >= 0xD800 and cp <= 0xDFFF) or cp == 0xFFFE or cp == 0xFFFF
					well_formed = not forbidden
				end
			end
			if well_formed then
				# Copy the whole sequence as is
				var pend = pos + nxst
				for i in [pos .. pend[ do ret.add mits[i]
				pos += nxst
			else
				ret.append badchar
				pos += 1
			end
		end
		return ret
	end
end
271
# Iterator over the bytes of a `Bytes` buffer
private class BytesIterator
	super IndexedIterator[Byte]

	# Native storage the bytes are read from
	var tgt: NativeString

	# Position of the current byte in `tgt`
	redef var index

	# Exclusive upper bound for `index`
	var max: Int

	# Iterates on `b`, from its first byte up to its current `length`
	init with_buffer(b: Bytes) do
		init(b.items, 0, b.length)
	end

	redef fun is_ok do
		return max > index
	end

	redef fun next do
		index += 1
	end

	redef fun item do
		return tgt[index]
	end
end
289
redef class Text
	# A fresh, mutable `Bytes` holding the bytes of `self`
	#
	# ~~~nit
	# assert "String".to_bytes isa Bytes
	# assert "String".to_bytes == [83u8, 116u8, 114u8, 105u8, 110u8, 103u8]
	# ~~~
	fun to_bytes: Bytes do
		var res = new Bytes.with_capacity(bytelen)
		append_to_bytes res
		return res
	end

	# Is every byte of `self` a hexadecimal digit?
	#
	#     assert "0B1d3F".is_valid_hexdigest
	#     assert not "5G".is_valid_hexdigest
	fun is_valid_hexdigest: Bool do
		for digit in bytes do
			if not digit.is_valid_hexdigit then return false
		end
		return true
	end

	# Appends `self.bytes` to `b`
	#
	# Each element of `substrings` is copied in one go from its native storage.
	fun append_to_bytes(b: Bytes) do
		for s in substrings do
			# Only a `FlatString` supplies `first_byte`; others are read from offset 0
			var from = 0
			if s isa FlatString then from = s.first_byte
			b.append_ns_from(s.items, s.bytelen, from)
		end
	end

	# Decodes `self`, read as a hexdigest, to a new `Bytes` instance
	#
	#     assert "0B1F4D".hexdigest_to_bytes == [0x0Bu8, 0x1Fu8, 0x4Du8]
	#
	# REQUIRE: `self` is a valid hexdigest and hexdigest.length % 2 == 0
	fun hexdigest_to_bytes: Bytes do
		var digits = bytes
		var total = bytelen
		var ret = new Bytes.with_capacity(total / 2)
		var pos = 0
		while pos < total do
			# Two hex digits make one byte: high nibble first, then low nibble
			var hi = digits[pos].hexdigit_to_byteval << 4
			var lo = digits[pos + 1].hexdigit_to_byteval
			ret.add(hi | lo)
			pos += 2
		end
		return ret
	end
end
338
redef class FlatText
	# Appends the bytes of `self` to `b` with a single copy from `items`
	redef fun append_to_bytes(b) do
		# Only a `FlatString` supplies `first_byte`; otherwise read from offset 0
		var from = 0
		if self isa FlatString then from = first_byte
		b.append_ns_from(items, bytelen, from)
	end
end
345
redef class NativeString
	# Creates a new `Bytes` object built on `self`
	#
	# Both the length and the capacity of the result are `cstring_length`.
	fun to_bytes: Bytes do
		var n = cstring_length
		return new Bytes(self, n, n)
	end
end