csv: Allow output customization.
[nit.git] / lib / csv / csv.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # CSV document handling.
16 module csv
17
# Describes a CSV dialect: its delimiter, its separator and its end of line.
class CsvFormat
	# The character that delimits an escaped value.
	#
	# Inside an escaped value, the delimiter is escaped by doubling it.
	var delimiter: Char

	# The character that separates the cells of a row.
	var separator: Char

	# The character sequence that ends a row (end of line).
	var eol: String

	# Escape sequence for the delimiter (the delimiter written twice).
	#
	# Computed on first access.
	private var escaping = "{delimiter}{delimiter}" is lazy

	# Escape the specified cell.
	#
	# The result is `cell` surrounded by `delimiter`, with every delimiter
	# found in `cell` replaced by `escaping`.
	private fun escape_cell(cell: String): Text do
		var escaped = new RopeBuffer
		escaped.add delimiter
		escaped.append cell.replace(delimiter, escaping)
		escaped.add delimiter
		return escaped
	end

	# Can the specified value be inserted without any escaping?
	#
	# A value is clean when none of its characters is the delimiter, the
	# separator or one of the characters of `eol`.
	private fun is_value_clean(value: String): Bool do
		for c in value.chars do
			if c == delimiter or c == separator or eol.chars.has(c) then
				return false
			end
		end
		return true
	end
end
53
# An in-memory CSV table: a header row followed by records.
class CsvDocument
	super Streamable

	# The format used when the document is written.
	#
	# Defaults to `rfc4180`.
	var format: CsvFormat = rfc4180 is writable

	# The header.
	#
	# Contains the names of all the fields of this table.
	var header: Array[String] = new Array[String] is writable

	# The list of the records.
	#
	# All records must have the same length as `header`.
	var records: Array[Array[String]] = new Array[Array[String]]

	# Replace the header by the specified row.
	#
	# The current `header` array is cleared, then refilled with the `to_s`
	# representation of each value.
	fun set_header(values: Object...) do
		header.clear
		for name in values do
			header.add name.to_s
		end
	end

	# Append the specified record.
	#
	# Each value is stored as its `to_s` representation.
	# The number of values must be equal to the length of `header`: when it
	# is not, a message is written to the standard error stream and the
	# assertion fails.
	fun add_record(values: Object...) do
		assert values.length == header.length else
			sys.stderr.write "CSV error: Header declares {header.length} columns, record contains {values.length} values.\n"
		end
		var cells = new Array[String]
		for value in values do
			cells.add value.to_s
		end
		records.add cells
	end

	# Write the header, then each record in order, to `stream`.
	#
	# The rows are emitted by a `CsvWriter` configured with `format`.
	redef fun write_to(stream) do
		var out = new CsvWriter.with_format(stream, format)
		out.write_sequence header
		for row in records do
			out.write_sequence row
		end
	end

	# Deprecated alias for `write_to_file`.
	fun save(file: String) do
		write_to_file file
	end
end
98
# Writes CSV rows to an output stream.
#
# By default, uses the format recommended by RFC 4180 (see `rfc4180`).
#
# Note: A row that contains only an empty cell cannot be distinguished from
# an empty line. Empty values are always written unescaped, so that parsers
# do not mistake them for an escaped delimiter.
#
# ~~~nit
# var out = new StringOStream
# var writer = new CsvWriter(out)
# writer.write_row(1, 2.0, "foo\nbar")
# writer.write_sequence([""])
# assert out.to_s == """1,2.0,"foo\nbar"\r\n\r\n"""
# ~~~
class CsvWriter

	# The output stream.
	var ostream: OStream

	# The format to use.
	#
	# Defaults to `rfc4180`.
	var format: CsvFormat = rfc4180

	# Do we escape all cells (except empty ones)?
	#
	# If `false` (the default), only the cells that contain a metacharacter
	# of the format are escaped. Empty cells are never escaped. When `true`,
	# the cells are not scanned for metacharacters (faster); when `false`,
	# the output is smaller.
	#
	# Note: Escaping may not be correctly recognized by some parsers.
	var always_escape = false is writable

	# Create a new writer with the specified format.
	init with_format(ostream:OStream, format: CsvFormat) do
		self.ostream = ostream
		self.format = format
	end

	# Append the specified sequence as a row.
	#
	# The representation of each cell is determined by `to_s`.
	# Cells are joined by the separator of `format`, and the row is always
	# terminated by the end of line of `format`, even when `row` is empty.
	fun write_sequence(row: SequenceRead[Object]) do
		var separator = format.separator.to_s
		# The separator goes between cells, so skip it before the first one.
		var is_first = true
		for cell in row do
			if is_first then
				is_first = false
			else
				ostream.write separator
			end
			write_cell cell.to_s
		end
		ostream.write format.eol
	end

	# Append the specified row.
	#
	# The representation of each cell is determined by `to_s`.
	fun write_row(row: Object...) do
		write_sequence row
	end

	# Close the output stream.
	fun close do
		ostream.close
	end

	# Write a single cell to the output stream.
	#
	# Nothing is written for an empty cell. A non-empty cell is escaped when
	# `always_escape` is set or when it contains a metacharacter of `format`;
	# otherwise it is written as is.
	private fun write_cell(cell: String) do
		if cell.is_empty then return
		if always_escape or not format.is_value_clean(cell) then
			ostream.write format.escape_cell(cell)
		else
			ostream.write cell
		end
	end
end
175
# The CSV format recommended by RFC 4180.
#
# * `delimiter`: `'"'`
# * `separator`: `','`
# * `eol`: `"\r\n"`
#
# The format object is created once and the same instance is returned by
# every call.
fun rfc4180: CsvFormat do
	var shared_format = once new CsvFormat('"', ',', "\r\n")
	return shared_format
end