lib/core: Renamed `Text::bytelen` to `Text::byte_length`
[nit.git] / lib / csv / csv.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # CSV document handling.
16 module csv
17
redef class Text
	# Escape the content of `self` for inclusion in a CSV document.
	#
	# * `sep_char`: the character that separates the cells of a record.
	# * `delim_char`: the character that encloses an escaped value.
	# * `eol`: the string that terminates a record.
	#
	# Returns `self` unchanged (as a `String`) when it contains neither
	# `sep_char`, `delim_char` nor `eol`. Otherwise, returns `self` enclosed in
	# `delim_char`, every inner occurrence of `delim_char` being doubled.
	private fun escape_to_csv(sep_char, delim_char: Char, eol: String): String do
		var add_sp = chars_to_escape_csv(sep_char, delim_char, eol)
		if add_sp == 0 then return to_s
		var bf = new Buffer.with_cap(add_sp + byte_length)
		# Enclose with the configured delimiter, not a hard-coded `"`,
		# so that the doubling done below stays consistent with the enclosure.
		bf.add delim_char
		for c in chars do
			if c == delim_char then bf.add c
			bf.add c
		end
		bf.add delim_char
		return bf.to_s
	end

	# How many more bytes should be allocated for CSV escaping?
	#
	# Returns `0` if and only if `self` does not need to be escaped, that is
	# when it contains neither `sep_char`, `delim_char` nor a complete `eol`.
	# Otherwise, returns the byte length of the two enclosing delimiters plus
	# one extra delimiter for each occurrence of `delim_char` in `self`.
	private fun chars_to_escape_csv(sep_char, delim_char: Char, eol: String): Int do
		var delims = 0
		var need_esc = false
		var ln = length
		var eol_ln = eol.length
		var i = 0
		while i < ln do
			var c = self[i]
			if c == delim_char then
				delims += 1
				need_esc = true
			else if c == sep_char then
				need_esc = true
			else if not need_esc and eol_ln > 0 and i + eol_ln <= ln then
				# Look for a complete `eol` starting at `i`.
				# The bound check above keeps `self[i + j]` inside the text, and a
				# failed match never resets a need for escaping found earlier.
				var j = 0
				while j < eol_ln and self[i + j] == eol[j] do
					j += 1
				end
				if j == eol_ln then need_esc = true
			end
			i += 1
		end
		if not need_esc then return 0
		return (delims + 2) * delim_char.u8char_len
	end

	# Unescape the content of `self` from CSV format to Nit String.
	#
	# Each occurrence of `delim_char` is kept and the character that follows
	# it (expected to be the second half of a doubled delimiter) is dropped.
	# The enclosing delimiters, if any, are expected to be already removed.
	private fun unescape_csv(delim_char: Char): String do
		var to_un = chars_to_unescape_csv(delim_char)
		if to_un == 0 then return to_s
		var buf = new Buffer.with_cap(byte_length - to_un)
		var pos = 0
		var ln = length
		while pos < ln do
			var c = self[pos]
			# Skip the character that follows a delimiter.
			if c == delim_char then pos += 1
			buf.add c
			pos += 1
		end
		return buf.to_s
	end

	# How many bytes should be removed for CSV unescaping?
	#
	# Returns `0` when `unescape_csv` would leave the content unchanged.
	private fun chars_to_unescape_csv(delim_char: Char): Int do
		var pos = 0
		var to_un = 0
		var ln = length
		var delim_ln = delim_char.u8char_len
		while pos < ln do
			if self[pos] == delim_char then
				pos += 1
				# Only count a character that actually exists: a lone delimiter
				# at the very end of the text has nothing after it to drop.
				if pos < ln then to_un += delim_ln
			end
			pos += 1
		end
		return to_un
	end
end
98
# Format settings common to all the CSV-related classes.
#
# This class only exists to share the implementation of these settings:
# clients should not use it as a static type.
abstract class CsvStream
	# The character that encloses an escaped value.
	#
	# Inside an escaped value, the delimiter itself is escaped by doubling it.
	var delimiter: Char = '"' is writable

	# The character that separates the cells of a record.
	var separator: Char = ',' is writable

	# The string that terminates a record (end of line).
	var eol: String = "\n" is writable
end
115
# A CSV document representation.
class CsvDocument
	super Writable
	super CsvStream

	# The header.
	#
	# Contains the name of all fields in this table.
	var header = new Array[String] is writable, optional

	# The list of the records.
	#
	# All records must have the same length as `header`.
	var records = new Array[Array[String]] is writable, optional

	# Adds a new record to the document containing the values in `objs`.
	#
	# Each value is converted with `to_s`.
	fun add_record(objs: Object...) do
		var ln = new Array[String].with_capacity(objs.length)
		for i in objs do ln.add(i.to_s)
		records.add ln
	end

	# Writes the header (when not empty) then all the records to `stream`,
	# using the `separator`, `eol` and `delimiter` of `self`.
	redef fun write_to(stream) do
		var s = new CsvWriter(stream)
		s.separator = separator
		s.eol = eol
		s.delimiter = delimiter
		if not header.is_empty then
			s.write_line header
		end
		s.write_lines(records)
	end

	# Load from the specified stream.
	#
	# The content of `stream` is parsed with the `separator`, `eol` and
	# `delimiter` of `self`, then replaces the current `header` and `records`.
	#
	# Parameters:
	#
	# * `stream`: Input stream.
	# * `has_header`: Is the first record the header? - defaults to true
	# * `skip_empty`: Do we skip the empty lines? - defaults to true
	fun load_from(stream: Reader, has_header: nullable Bool, skip_empty: nullable Bool) do
		if has_header == null then has_header = true
		if skip_empty == null then skip_empty = true
		var reader = new CsvReader(stream)
		reader.separator = separator
		reader.eol = eol
		reader.delimiter = delimiter
		reader.skip_empty = skip_empty
		# Actually parse the stream: configuring the reader alone loads nothing.
		var doc = reader.read_all(has_header)
		header = doc.header
		records = doc.records
	end
end
166
# Appends CSV records to a file.
#
# By default, uses the format recommended by RFC 4180 (see `rfc4180`).
#
# Note: If a record contains only an empty cell, its representation is
# indistinguishable from an empty line. This is because the empty values are
# always written unescaped in order to avoid them being interpreted as escaped
# delimiters by some parsers.
#
# ~~~nit
# var out = new StringWriter
# var writer = new CsvWriter(out)
# writer.write_elements(1, 2.0, "foo\nbar")
# writer.write_line([""])
# assert out.to_s == """1,2.0,"foo\nbar"\n\n"""
# ~~~
class CsvWriter
	super CsvStream

	# The output stream.
	var ostream: Writer

	# Write several lines to a stream
	fun write_lines(lines: Array[Array[Object]]) do for i in lines do write_line i

	# Append the elements in `els` as a record.
	#
	# The representation of each cell is determined by `to_s`.
	fun write_elements(els: Object...) do write_line(els)

	# Append the specified record.
	#
	# The representation of each cell is determined by `to_s`.
	# Cells are separated by `separator` and the record is ended by `eol`.
	# A record without any cell is written as a lone `eol`.
	fun write_line(line: Array[Object]) do
		var os = ostream
		var esc = delimiter
		var sep = separator
		var eol = eol
		var is_first = true
		for cell in line do
			# The separator goes between cells, so never before the first one.
			# Iterating this way also avoids `last`, which aborts on an empty record.
			if is_first then
				is_first = false
			else
				os.write_char(sep)
			end
			os.write(cell.to_s.escape_to_csv(sep, esc, eol))
		end
		os.write(eol)
	end
end
224
# Reads records from a CSV file.
#
# By default, the format recognizes EOLs as `\n`
#
# ~~~nit
# var example = """
# foo,bar
# "Hello, word!",1234.5 + 42
# "Something
# ""else""\", baz
# """
# var reader = new CsvReader.from_string(example)
# var table = reader.read_all
#
# assert table.header == ["foo","bar"]
# assert table.records == [["Hello, word!","1234.5 + 42"],
# ["Something\n\"else\""," baz"]]
# ~~~
class CsvReader
	super CsvStream

	# The input stream.
	var istream: Reader

	# Do we skip the empty lines?
	#
	# Note: Even if this attribute is `false`, the presence of a line ending at
	# the end of the last record does not change the number of returned records.
	# This is because the line endings are processed as terminators, not as
	# separators. Therefore, when there is more than one line ending at the end
	# of the file, the additional lines are interpreted as empty records that
	# are skipped only if `skip_empty` is set to `true`.
	#
	# `false` by default.
	var skip_empty: Bool = false is writable

	# Creates a new CSVReader from a `string` data
	init from_string(s: String) do init(new StringReader(s))

	# Reads the content of the Stream and interprets it as a CSV Document
	#
	# Optional parameter `has_header` determines whether the first line
	# of the CSV Document is header data.
	# Defaults to true
	#
	# The last record does not need to be terminated by `eol`.
	# A separator followed by the end of the stream denotes a last empty cell.
	# A delimited value that is not closed before the end of the stream is
	# ended by the end of the stream.
	fun read_all(has_header: nullable Bool): CsvDocument do
		if has_header == null then has_header = true
		var header: nullable Array[String] = null
		var res_data = new Array[Array[String]]
		var iss = istream
		var esc = delimiter
		var sep = separator
		var eol = eol
		# First character of `eol`; stays `null` (never matched) if `eol` is empty.
		var eol_st: nullable Char = null
		if not eol.is_empty then eol_st = eol.first
		# Characters of a partially matched `eol`.
		var eol_buf = new Buffer.with_cap(eol.length)
		# Was the current cell ended by a complete `eol`?
		var is_eol = false
		# Cells of the current record.
		var line = new Array[String]
		# Content of the current cell.
		var el = new Buffer
		# Look-ahead character: `null` once the stream is exhausted.
		var c = iss.read_char
		# Is there still a cell to read?
		#
		# The loop is driven by the look-ahead character and not by `iss.eof`,
		# since `eof` may already be true while `c` is still to be processed.
		var pending = c != null
		while pending do
			# Read one cell. The loop ends on a separator (`c == sep`),
			# on a complete end of line (`is_eol`) or on the end of the stream
			# (`c == null`).
			loop
				if c == esc then
					# Delimited section: copy everything up to the closing delimiter.
					c = iss.read_char
					loop
						if c == esc then
							c = iss.read_char
							# A doubled delimiter is an escaped delimiter,
							# anything else closes the section.
							if c != esc then break
						end
						if c == null then break
						el.add c
						c = iss.read_char
					end
				end
				if c == null then break
				if c == sep then break
				if c == eol_st then
					eol_buf.add c
					is_eol = true
					for i in [1 .. eol.length[ do
						c = iss.read_char
						if c == null or c != eol[i] then
							# Not an end of line after all:
							# the characters matched so far belong to the cell.
							is_eol = false
							el.append(eol_buf)
							eol_buf.clear
							break
						end
						eol_buf.add c
					end
					# `c` is the mismatching character: process it as usual.
					if not is_eol then continue
					eol_buf.clear
					break
				end
				el.add c
				c = iss.read_char
			end
			line.add el.to_s
			el.clear
			if not is_eol and c != null then
				# The cell was ended by a separator: another cell follows,
				# even if it is empty because the stream ends here.
				c = iss.read_char
				continue
			end
			# End of the record.
			if is_eol then
				c = iss.read_char
				is_eol = false
			end
			pending = c != null
			# An empty line is read as a record made of a single empty cell.
			if skip_empty and line.length == 1 and line.first.is_empty then
				line.clear
				continue
			end
			if has_header and header == null then
				header = line
			else
				res_data.add line
			end
			line = new Array[String]
		end
		if header == null then header = new Array[String]
		var doc = new CsvDocument
		doc.header = header
		doc.records = res_data
		return doc
	end
end