csv: Add a reader.
[nit.git] / lib / csv / csv.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # CSV document handling.
16 module csv
17
# Describes the metacharacters of a CSV dialect.
class CsvFormat
	# The character that surrounds an escaped value.
	#
	# Inside an escaped value, the delimiter itself is written twice.
	var delimiter: Char

	# The character that separates the cells of a row.
	var separator: Char

	# The sequence that terminates a row (end of line).
	var eol: String

	# The doubled delimiter used to represent one delimiter inside a cell.
	private var escaping = "{delimiter}{delimiter}" is lazy

	# Return `cell` surrounded by delimiters, with inner delimiters doubled.
	private fun escape_cell(cell: String): Text do
		var doubled = cell.replace(delimiter, escaping)
		var buffer = new RopeBuffer
		buffer.add delimiter
		buffer.append doubled
		buffer.add delimiter
		return buffer
	end

	# Is `value` free of any metacharacter of this format?
	#
	# A value is clean when it contains neither the delimiter, nor the
	# separator, nor any character of `eol`.
	private fun is_value_clean(value: String): Bool do
		for c in value.chars do
			if c == delimiter or c == separator or eol.chars.has(c) then
				return false
			end
		end
		return true
	end
end
53
# A CSV document representation.
#
# Holds a header and a list of records in memory. The document can be written
# to a stream (`write_to`) or filled from a stream (`load_from`) or a file
# (`load`).
class CsvDocument
	super Streamable

	# The format to use.
	#
	# Defaults to `rfc4180`.
	var format: CsvFormat = rfc4180 is writable

	# The header.
	#
	# Contains the name of all fields in this table.
	var header: Array[String] = new Array[String] is writable

	# The list of the records.
	#
	# All records must have the same length as `header`.
	var records: Array[Array[String]] = new Array[Array[String]]

	# Replace the header by the specified row.
	#
	# Each value is converted with `to_s`.
	fun set_header(values: Object...) do
		header.clear
		for value in values do header.add(value.to_s)
	end

	# Append the specified record.
	#
	# Each value is converted with `to_s`. The number of values must match
	# the length of `header`, else the assertion fails.
	fun add_record(values: Object...) do
		assert values.length == header.length else
			sys.stderr.write "CSV error: Header declares {header.length} columns, record contains {values.length} values.\n"
		end
		var record = new Array[String]
		for value in values do record.add(value.to_s)
		records.add(record)
	end

	# Write the header, then every record, using `format`.
	redef fun write_to(stream) do
		var writer = new CsvWriter.with_format(stream, format)
		writer.write_sequence(header)
		for record in records do writer.write_sequence(record)
	end

	# Deprecated alias for `write_to_file`.
	fun save(file: String) do write_to_file(file)

	# Load from the specified stream.
	#
	# The previous records are discarded. The previous header is replaced
	# only when `has_header` is `true`.
	#
	# Parameters:
	#
	# * `stream`: Input stream.
	# * `has_header`: Is the first row the header?
	# * `skip_empty`: Do we skip the empty lines?
	# For details, see `CsvReader.skip_empty`.
	fun load_from(stream: IStream, has_header: Bool, skip_empty: Bool) do
		var reader = new CsvReader.with_format(stream, format)
		reader.skip_empty = skip_empty
		if has_header then
			if reader.is_ok then
				header = reader.item
				# Consume the header row, otherwise the loop below would
				# add it again as the first record.
				reader.next
			else
				header.clear
			end
		end
		records.clear
		for record in reader do records.add(record)
	end

	# Load from the specified file.
	#
	# Opens the file, delegates to `load_from`, then closes the file.
	#
	# Parameters:
	#
	# * `path`: Path of the file.
	# * `has_header`: Is the first row the header?
	# * `skip_empty`: Do we skip the empty lines?
	fun load(path: String, has_header: Bool, skip_empty: Bool) do
		var istream = new IFStream.open(path)
		load_from(istream, has_header, skip_empty)
		istream.close
	end
end
133
# Appends CSV rows to a file.
#
# By default, uses the format recommended by RFC 4180 (see `rfc4180`).
#
# Note: A row made of a single empty cell cannot be told apart from an empty
# line. Empty values are always written unescaped, so that parsers do not
# mistake them for escaped delimiters.
#
# ~~~nit
# var out = new StringOStream
# var writer = new CsvWriter(out)
# writer.write_row(1, 2.0, "foo\nbar")
# writer.write_sequence([""])
# assert out.to_s == """1,2.0,"foo\nbar"\r\n\r\n"""
# ~~~
class CsvWriter

	# The output stream.
	var ostream: OStream

	# The format to use.
	#
	# Defaults to `rfc4180`.
	var format: CsvFormat = rfc4180

	# Do we escape all cells (except empty ones)?
	#
	# If `false` (the default), only the cells that contain a metacharacter
	# of the format are escaped. Empty cells are never escaped. This option
	# is a trade-off between performance (when `true`) and output size
	# (when `false`).
	#
	# Note: Escaping may not be correctly recognized by some parsers.
	var always_escape = false is writable

	# Create a new writer with the specified format.
	init with_format(ostream:OStream, format: CsvFormat) do
		self.ostream = ostream
		self.format = format
	end

	# Append the specified sequence as a row.
	#
	# The representation of each cell is determined by `to_s`.
	# The row is always terminated by the `eol` of the format, even when
	# the sequence is empty.
	fun write_sequence(row: SequenceRead[Object]) do
		var separator = format.separator.to_s
		var first = true
		for cell in row do
			# A separator goes between cells, never before the first one.
			if first then
				first = false
			else
				ostream.write separator
			end
			write_cell cell.to_s
		end
		ostream.write format.eol
	end

	# Append the specified row.
	#
	# The representation of each cell is determined by `to_s`.
	fun write_row(row: Object...) do write_sequence(row)

	# Close the output stream.
	fun close do ostream.close

	# Write one cell, escaping it when required.
	#
	# Nothing is written for an empty cell.
	private fun write_cell(cell: String) do
		if cell.is_empty then return
		if always_escape or not format.is_value_clean(cell) then
			ostream.write format.escape_cell(cell)
		else
			ostream.write cell
		end
	end
end
210
# Reads rows from a CSV file.
#
# By default, uses the format recommended by RFC 4180 (see `rfc4180`).
#
# The reader is an iterator over the rows: the first row is read lazily, on
# the first call to `is_ok`, `item` or `next`.
#
# ~~~nit
# var example = new StringIStream("""
# foo,bar\r
# "Hello, word!",1234.5 + 42\r
# "Something\r
# ""else""\", baz\r
# """)
# var reader = new CsvReader(example)
# var table = new Array[Array[String]]
#
# for row in reader do table.add row
# assert table == [
# 	["foo","bar"],
# 	["Hello, word!","1234.5 + 42"],
# 	["Something\r\n\"else\""," baz"]
# ]
# ~~~
class CsvReader
	super Iterator[Array[String]]

	# The input stream.
	var istream: IStream

	# The format to use.
	#
	# Defaults to `rfc4180`.
	var format: CsvFormat = rfc4180 is lazy

	# Do we skip the empty lines?
	#
	# Note: Even if this attribute is `false`, the presence of a line ending
	# at the end of the last row does not change the number of returned rows.
	# This is because the line endings are processed as terminators, not as
	# separators. Therefore, when there is more than one line ending at the end
	# of the file, the additional lines are interpreted as empty rows that
	# are skipped only if `skip_empty` is set to `true`.
	#
	# `false` by default.
	var skip_empty: Bool = false is writable

	# The last read row, or `null` when there is no more row.
	private var row: nullable Array[String] = null

	# Did we read something?
	private var started = false

	# Create a new reader with the specified format.
	init with_format(istream:IStream, format: CsvFormat) do
		self.istream = istream
		self.format = format
	end

	# Read the first row, if needed.
	fun prepare do
		if not started then
			row = read_row
			started = true
		end
	end

	# Read the next row.
	#
	# Requires `is_ok`.
	redef fun next do
		prepare
		assert is_ok else
			sys.stderr.write "Already at the end of the stream.\n"
		end
		row = read_row
	end

	# Return the last read row.
	redef fun item do
		prepare
		return row.as(not null)
	end

	# Is there a current row?
	redef fun is_ok do
		prepare
		return row != null
	end

	# Free some internal resources and set `is_ok` to `false`.
	#
	# Do not close the input stream.
	redef fun finish do
		row = null
		# Mark the reader as started so that a later `is_ok` does not
		# trigger `prepare` and read a row after `finish`.
		started = true
	end

	# Close the input stream.
	fun close do istream.close

	# Read one row from `istream`.
	#
	# Return `null` when the end of the stream is reached before anything
	# belonging to a row was read.
	private fun read_row: nullable Array[String] do
		if istream.eof then return null
		var row = new Array[String]
		var value = new RopeBuffer

		# Number of unescaped characters since the last delimiter or separator.
		var unescaped = 0

		# Did we read the start of a row?
		var got_row = false

		# Did we find a delimited string in the current cell?
		var got_delimiter = false

		loop
			var i = istream.read_char
			var c: Char

			if i < 0 then
				# End of stream: flush the pending row, if any.
				if got_row then
					row.add value.to_s
					return row
				else
					return null
				end
			end
			c = i.ascii

			if c == format.delimiter then
				if got_delimiter and unescaped == 0 then
					# Got an escaped delimiter.
					value.add format.delimiter
				end
				# Read all bytes until the delimiter.
				loop
					i = istream.read_char
					assert not_eof: i >= 0 else
						sys.stderr.write "Unexpected end of file before the end of a delimited value.\n"
					end
					c = i.ascii
					if c == format.delimiter then break
					value.add c
				end
				unescaped = 0
				got_row = true
				got_delimiter = true
			else if c == format.separator then
				# Flush the value to the row.
				row.add value.to_s
				value.clear
				unescaped = 0
				got_delimiter = false
				# A separator is part of a row: without this, a last row
				# made only of separators and lacking a final line ending
				# would be dropped at the end of the stream.
				got_row = true
			else
				value.add c
				unescaped += 1
				# The line ending only counts when all its characters
				# were read outside of a delimited string.
				if unescaped >= format.eol.length and
						value.has_suffix(format.eol) then
					var value_trimed = value.substring(0,
							value.length - format.eol.length).to_s
					if skip_empty and row.is_empty and
							value_trimed.is_empty and
							not got_delimiter then
						# Skip the empty line.
						value.clear
						unescaped = 0
						got_row = false
					else
						row.add value_trimed
						return row
					end
				else
					got_row = true
				end
			end
		end
	end
end
379
# The CSV format recommended by [RFC 4180](https://tools.ietf.org/html/rfc4180).
#
# The same instance is returned by every call.
#
# * `delimiter`: `'"'`
# * `separator`: `','`
# * `eol`: `"\r\n"`
fun rfc4180: CsvFormat do
	var shared_format = once new CsvFormat('"', ',', "\r\n")
	return shared_format
end