59c4c5f5bcc5e57fbbedbf17de8b912b7b2934da
[nit.git] / lib / core / bytes.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Services for byte streams and arrays
16 module bytes
17
18 import kernel
19 import collection::array
20 intrude import text::flat
21
# A buffer containing Byte-manipulation facilities
#
# Uses Copy-On-Write when persisted: once `to_s` has handed the underlying
# buffer to a `String`, the next mutation first works on a private copy,
# so the `String` is left untouched.
#
#     var b = new Bytes.empty
#     b.append([104u8, 105u8])
#     var s = b.to_s
#     b[0] = 72u8
#     assert s == "hi"
#     assert b.to_s == "Hi"
class Bytes
	super AbstractArray[Byte]

	# A NativeString being a char*, it can be used as underlying representation here.
	private var items: NativeString

	# Number of bytes in the array
	redef var length

	# Capacity of the array
	private var capacity: Int

	# Has this buffer been persisted (to_s'd)?
	#
	# Used for Copy-On-Write
	private var persisted = false

	#     var b = new Bytes.empty
	#     assert b.to_s == ""
	init empty do
		var ns = new NativeString(0)
		init(ns, 0, 0)
	end

	# Init a `Bytes` with capacity `cap`
	init with_capacity(cap: Int) do
		var ns = new NativeString(cap)
		init(ns, 0, cap)
	end

	# Is there no byte in `self`?
	#
	#     var b = new Bytes.empty
	#     assert b.is_empty
	#     b.add 101u8
	#     assert not b.is_empty
	redef fun is_empty do return length == 0

	#     var b = new Bytes.empty
	#     b.add 101u8
	#     assert b[0] == 101u8
	redef fun [](i) do
		assert i >= 0
		assert i < length
		return items[i]
	end

	# Sets the byte at index `i` to `v`
	#
	# `i` may be equal to `length`, in which case `v` is appended.
	#
	#     var b = new Bytes.with_capacity(1)
	#     b[0] = 101u8
	#     assert b.to_s == "e"
	redef fun []=(i, v) do
		assert i >= 0
		assert i <= length
		if i == length then
			# `add` takes care of Copy-On-Write and of the growth
			add(v)
		else
			if persisted then regen
			items[i] = v
		end
	end

	#     var b = new Bytes.empty
	#     b.add 101u8
	#     assert b.to_s == "e"
	redef fun add(c) do
		if persisted then regen
		# Ask for room for one more byte: `enlarge(length)` would be a no-op
		# when `length == capacity` and the write below would overflow.
		if length >= capacity then
			enlarge(length + 1)
		end
		items[length] = c
		length += 1
	end

	#     var b = new Bytes.empty
	#     b.append([104u8, 101u8, 108u8, 108u8, 111u8])
	#     assert b.to_s == "hello"
	redef fun append(arr) do
		if arr isa Bytes then
			# Bulk copy straight from the native buffer of the other `Bytes`
			append_ns(arr.items, arr.length)
		else
			for i in arr do add i
		end
	end

	#     var b = new Bytes.empty
	#     b.append([0x41u8, 0x41u8, 0x18u8])
	#     b.pop
	#     assert b.to_s == "AA"
	redef fun pop do
		assert length >= 1
		length -= 1
		return items[length]
	end

	# Forgets every byte of `self`, the capacity is left as is
	redef fun clear do length = 0

	# Regenerates the buffer, necessary when it was persisted
	#
	# The `length` first bytes are copied to a fresh buffer of the same
	# `capacity`, which then replaces `items`.
	private fun regen do
		var nns = new NativeString(capacity)
		items.copy_to(nns, length, 0, 0)
		# Without this assignment the copy is lost and later writes
		# would still hit the buffer shared with the persisted string.
		items = nns
		persisted = false
	end

	# Appends the `ln` first bytes of `ns` to self
	fun append_ns(ns: NativeString, ln: Int) do append_ns_from(ns, ln, 0)

	# Appends `ln` bytes from `ns` starting at index `from` to self
	fun append_ns_from(ns: NativeString, ln, from: Int) do
		if persisted then regen
		var nlen = length + ln
		if nlen > capacity then enlarge(nlen)
		ns.copy_to(items, ln, from, length)
		length += ln
	end

	# Grows the buffer so that it can hold at least `sz` bytes
	#
	# Does nothing when the capacity is already sufficient.
	redef fun enlarge(sz) do
		if capacity >= sz then return
		# The reallocation below gives `self` a private buffer
		persisted = false
		while capacity < sz do capacity = capacity * 2 + 2
		var ns = new NativeString(capacity)
		items.copy_to(ns, length, 0, 0)
		items = ns
	end

	# String built on the bytes of `self`
	#
	# When `self` is valid UTF-8 (see `is_utf8`), the string is built directly
	# on the buffer of `self`, which is then marked as persisted.
	# Otherwise it is built on a cleaned copy (see `clean_utf8`).
	redef fun to_s do
		persisted = true
		var b = self
		if not is_utf8 then
			b = clean_utf8
			# The string uses the buffer of the cleaned copy, not ours
			persisted = false
		end
		return new FlatString.with_infos(b.items, b.length, 0, b.length -1)
	end

	redef fun iterator do return new BytesIterator.with_buffer(self)

	# Is the byte collection valid UTF-8 ?
	fun is_utf8: Bool do
		var pos = 0
		var len = length
		while pos < len do
			var ln = valid_utf8_length_at(pos)
			if ln == 0 then return false
			pos += ln
		end
		return true
	end

	# Cleans the bytes of `self` to be UTF-8 compliant
	#
	# Returns a new `Bytes` where each byte that does not start an accepted
	# sequence is replaced by the bytes `0xEF 0xBF 0xBD`.
	private fun clean_utf8: Bytes do
		var badchar = once [0xEFu8, 0xBFu8, 0xBDu8]
		var pos = 0
		var len = length
		var ret = new Bytes.with_capacity(len)
		while pos < len do
			var ln = valid_utf8_length_at(pos)
			if ln == 0 then
				# Replace the offending byte only, then resynchronize on the next one
				ret.append badchar
				pos += 1
			else
				ret.append_ns_from(items, ln, pos)
				pos += ln
			end
		end
		return ret
	end

	# Length in bytes of the accepted UTF-8 sequence starting at `pos`, 0 if there is none
	#
	# A sequence is refused when it runs past `length`, when its first byte
	# does not match the pattern expected for its size, when its code point
	# is out of the range allowed for its size, or when its code point
	# is in `0xD800 .. 0xDFFF` or is `0xFFFE` or `0xFFFF`.
	private fun valid_utf8_length_at(pos: Int): Int do
		# Pairs of (mask, expected value) for the first byte of sequences of 1 to 4 bytes
		var charst = once [0x80u8, 0u8, 0xE0u8, 0xC0u8, 0xF0u8, 0xE0u8, 0xF8u8, 0xF0u8]
		# Lowest and highest code points accepted for sequences of 1 to 4 bytes
		var lobounds = once [0, 0x80, 0x800, 0x10000]
		var hibounds = once [0x7F, 0x7FF, 0xFFFF, 0x10FFFF]
		var mits = items
		var nxst = mits.length_of_char_at(pos)
		# A truncated trailing sequence must not be decoded past the end of the data
		if pos + nxst > length then return 0
		var charst_index = (nxst - 1) * 2
		if mits[pos] & charst[charst_index] != charst[charst_index + 1] then return 0
		var cp = mits.char_at(pos).ascii
		if cp > hibounds[nxst - 1] or cp < lobounds[nxst - 1] then return 0
		if cp >= 0xD800 and cp <= 0xDFFF or
		   cp == 0xFFFE or cp == 0xFFFF then return 0
		return nxst
	end
end
224
# Iterator on the bytes of a `Bytes`
private class BytesIterator
	super IndexedIterator[Byte]

	# Native buffer the bytes are read from
	var tgt: NativeString

	redef var index

	# Index at which the iteration stops (excluded)
	var max: Int

	# Iterates on all the bytes of `b`, from index 0 to `b.length - 1` included
	#
	# `max` is set to `b.length` since `is_ok` uses a strict comparison:
	# with `b.length - 1` the last byte would never be visited.
	init with_buffer(b: Bytes) do init(b.items, 0, b.length)

	redef fun is_ok do return index < max

	redef fun next do index += 1

	redef fun item do return tgt[index]
end
242
redef class Text
	# Copies the bytes of `self` into a new mutable `Bytes`
	#
	# ~~~nit
	# assert "String".to_bytes isa Bytes
	# assert "String".to_bytes == [83u8, 116u8, 114u8, 105u8, 110u8, 103u8]
	# ~~~
	fun to_bytes: Bytes do
		# Sized with `bytelen` before the bytes are appended
		var res = new Bytes.with_capacity(bytelen)
		append_to_bytes(res)
		return res
	end

	# Adds the bytes of `self` at the end of `b`
	#
	# Each element of `substrings` is appended in turn.
	fun append_to_bytes(b: Bytes) do
		for chunk in substrings do
			# A `FlatString` starts at `first_byte` in its buffer, anything else at 0
			var start = 0
			if chunk isa FlatString then start = chunk.first_byte
			b.append_ns_from(chunk.items, chunk.bytelen, start)
		end
	end
end
264
redef class FlatText
	# Adds the bytes of `self` at the end of `b` with a single native copy
	redef fun append_to_bytes(b) do
		# A `FlatString` starts at `first_byte` in its buffer, anything else at 0
		var start = 0
		if self isa FlatString then start = first_byte
		b.append_ns_from(items, bytelen, start)
	end
end
271
redef class NativeString
	# Builds a new `Bytes` on top of `self`
	#
	# `cstring_length` is used as both the length and the capacity
	# of the result, and `self` is given as its underlying buffer.
	fun to_bytes: Bytes do
		var size = cstring_length
		return new Bytes(self, size, size)
	end
end