From f2644e530b87ba42bd8d0fc9a4ef042fd66dec84 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Jean-Christophe=20Beaupr=C3=A9?= Date: Tue, 9 Dec 2014 12:31:59 -0500 Subject: [PATCH] csv: Allow output customization. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Also, escape cells when needed. Signed-off-by: Jean-Christophe Beaupré --- lib/csv/csv.nit | 145 +++++++++++++++++++++++++++++++++++----- lib/csv/test_csv.nit | 85 +++++++++++++++++++++++ src/metrics/mendel_metrics.nit | 1 + src/metrics/metrics_base.nit | 2 + src/rapid_type_analysis.nit | 1 + 5 files changed, 217 insertions(+), 17 deletions(-) create mode 100644 lib/csv/test_csv.nit diff --git a/lib/csv/csv.nit b/lib/csv/csv.nit index e3d0c0b..257d565 100644 --- a/lib/csv/csv.nit +++ b/lib/csv/csv.nit @@ -12,13 +12,54 @@ # See the License for the specific language governing permissions and # limitations under the License. -# CSV output facilities +# CSV document handling. module csv +# Specifies a CSV format. +class CsvFormat + # The character that delimits escaped values. + # + # The delimiter is escaped by doubling it. + var delimiter: Char + + # The character that separates the cells of a row. + var separator: Char + + # The string that ends a row (end of line). + var eol: String + + # Escape sequence for the delimiter. + private var escaping = "{delimiter}{delimiter}" is lazy + + # Escape the specified cell. + private fun escape_cell(cell: String): Text do + var result = new RopeBuffer + result.add delimiter + result.append cell.replace(delimiter, escaping) + result.add delimiter + return result + end + + # Can the specified value be inserted without any escaping? + private fun is_value_clean(value: String): Bool do + for c in value.chars do + if c == delimiter then return false + if c == separator then return false + if eol.chars.has(c) then return false + end + return true + end +end + # A CSV document representation. 
class CsvDocument super Streamable + # The format to use. + # + # Defaults to `rfc4180`. + var format: CsvFormat = rfc4180 is writable + # The header. # # Contains the name of all fields in this table. @@ -45,26 +86,96 @@ class CsvDocument records.add(record) end - private fun write_line_to(line: Collection[String], stream: OStream) - do - var i = line.iterator - if i.is_ok then - stream.write(i.item) + redef fun write_to(stream) do + var writer = new CsvWriter.with_format(stream, format) + writer.write_sequence(header) + for record in records do writer.write_sequence(record) + end + + # Deprecated alias for `write_to_file`. + fun save(file: String) do write_to_file(file) +end + +# Appends CSV rows to an output stream. +# +# By default, uses the format recommended by RFC 4180 (see `rfc4180`). +# +# Note: If a row contains only an empty cell, its representation is +# indistinguishable from an empty line. This is because empty values are +# always written unescaped, to prevent some parsers from interpreting them as +# escaped delimiters. +# +# ~~~nit +# var out = new StringOStream +# var writer = new CsvWriter(out) +# writer.write_row(1, 2.0, "foo\nbar") +# writer.write_sequence([""]) +# assert out.to_s == """1,2.0,"foo\nbar"\r\n\r\n""" +# ~~~ +class CsvWriter + + # The output stream. + var ostream: OStream + + # The format to use. + # + # Defaults to `rfc4180`. + var format: CsvFormat = rfc4180 + + # Do we escape all cells (except empty ones)? + # + # If `false` (the default), escape only cells that contain a metacharacter + # of the format. In all cases, empty cells are not escaped. This option + # lets the caller choose between optimizing performance (when + # `true`) and optimizing the size of the output (when `false`). + # + # Note: Escaping may not be correctly recognized by some parsers. + var always_escape = false is writable + + # Create a new writer with the specified format. 
+ init with_format(ostream:OStream, format: CsvFormat) do + self.ostream = ostream + self.format = format + end + + # Append the specified sequence as a row. + # + # The representation of each cell is determined by `to_s`. + fun write_sequence(row: SequenceRead[Object]) do + if not row.is_empty then + var i = row.iterator + var separator = format.separator.to_s + write_cell i.item.to_s i.next - while i.is_ok do - stream.write(";") - stream.write(i.item) - i.next + for cell in i do + ostream.write separator + write_cell cell.to_s end end - stream.write("\n") + ostream.write format.eol end - redef fun write_to(stream) do - write_line_to(header, stream) - for record in records do write_line_to(record, stream) - end + # Append the specified row. + # + # The representation of each cell is determined by `to_s`. + fun write_row(row: Object...) do write_sequence(row) - # Deprecated alias for `write_to_file`. - fun save(file: String) do write_to_file(file) + # Close the output stream. + fun close do ostream.close + + private fun write_cell(cell: String) do + if cell.is_empty then return + if not always_escape and format.is_value_clean(cell) then + ostream.write cell + else + ostream.write format.escape_cell(cell) + end + end end +end + +# The CSV format recommended by RFC 4180. +# +# * `delimiter`: `'"'` +# * `separator`: `','` +# * `eol`: `"\r\n"` +fun rfc4180: CsvFormat do return once new CsvFormat('"', ',', "\r\n") diff --git a/lib/csv/test_csv.nit b/lib/csv/test_csv.nit new file mode 100644 index 0000000..c176a67 --- /dev/null +++ b/lib/csv/test_csv.nit @@ -0,0 +1,85 @@ +# This file is part of NIT ( http://www.nitlanguage.org ). +# +# This file is free software, which comes along with NIT. This software is +# distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; +# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. 
You can modify it is you want, provided this header +# is kept unaltered, and a notification of the changes is added. +# You are allowed to redistribute it and sell it, alone or is a part of +# another product. + +# Tests for `csv`. +module test_csv is test_suite + +import test_suite +import csv + +class TestCsvWriter + super TestSuite + + # The custom CSV format used in the tests. + private var custom_format = new CsvFormat('/', ':', "#") + + # Check that `row` is written as `expected_rfc4180` and as `expected_custom`. + # + # Parameters: + # + # * `always_escape`: value of the `always_escape` option. + # * `row`: row to write. + # * `expected_rfc4180`: expected output with the RFC 4180 format. + # * `expected_custom`: expected output with the custom CSV format. + private fun expect(always_escape: Bool, row: SequenceRead[String], + expected_rfc4180: String, + expected_custom: String) do + var out = new StringOStream + var writer = new CsvWriter(out) + + writer.always_escape = always_escape + writer.write_sequence(row) + assert out.to_s == expected_rfc4180 else + sys.stderr.write "\nFormat: RFC 4180\n" + sys.stderr.write "Expecting: \"{expected_rfc4180.escape_to_nit}\"\n" + sys.stderr.write "Got: \"{out.to_s.escape_to_nit}\"\n" + end + writer.close + + out = new StringOStream + writer = new CsvWriter.with_format(out, custom_format) + writer.always_escape = always_escape + writer.write_sequence(row) + assert out.to_s == expected_custom else + sys.stderr.write "\nFormat: {custom_format.delimiter}" + sys.stderr.write " {custom_format.separator}" + sys.stderr.write " {custom_format.eol.escape_to_nit}\n" + sys.stderr.write "Expecting: \"{expected_custom.escape_to_nit}\"\n" + sys.stderr.write "Got: \"{out.to_s.escape_to_nit}\"\n" + end + writer.close + end + + fun test_empty do expect(true, new Array[String], "\r\n", "#") + + fun test_one_cell do expect(true, ["foo/\"\r\n,"], + "\"foo/\"\"\r\n,\"\r\n", + "/foo//\"\r\n,/#") + + fun test_optimize_size_escaped do expect(false, ["foo/\"\r\n,"], 
+ "\"foo/\"\"\r\n,\"\r\n", + "/foo//\"\r\n,/#") + + fun test_optimize_size_eol do expect(false, ["foo\r#\n"], + "\"foo\r#\n\"\r\n", + "/foo\r#\n/#") + + fun test_optimize_size_unescaped do expect(false, ["foo"], + "foo\r\n", + "foo#") + + fun test_multiple_cells do expect(true, ["1", "", "/"], + "\"1\",,\"/\"\r\n", + "/1/::////#") + + fun test_multiple_cells_optimize_size do expect(false, ["1", "", "/"], + "1,,/\r\n", + "1::////#") +end diff --git a/src/metrics/mendel_metrics.nit b/src/metrics/mendel_metrics.nit index 80c4ed8..1ee1db2 100644 --- a/src/metrics/mendel_metrics.nit +++ b/src/metrics/mendel_metrics.nit @@ -103,6 +103,7 @@ private class MendelMetricsPhase if csv then var csvh = new CsvDocument + csvh.format = new CsvFormat('"', ';', "\n") csvh.header = ["povr", "ovr", "pext", "ext", "pspe", "spe", "prep", "rep", "eq"] for mclass in mclasses do var povr = mclass.is_pure_overrider(vis).object_id diff --git a/src/metrics/metrics_base.nit b/src/metrics/metrics_base.nit index c99ec62..c51e656 100644 --- a/src/metrics/metrics_base.nit +++ b/src/metrics/metrics_base.nit @@ -370,6 +370,8 @@ class MetricSet fun to_csv: CsvDocument do var csv = new CsvDocument + csv.format = new CsvFormat('"', ';', "\n") + # set csv headers csv.header.add("entry") for metric in metrics do csv.header.add(metric.name) diff --git a/src/rapid_type_analysis.nit b/src/rapid_type_analysis.nit index b4cf4ab..7881db0 100644 --- a/src/rapid_type_analysis.nit +++ b/src/rapid_type_analysis.nit @@ -122,6 +122,7 @@ class RapidTypeAnalysis var types = typeset.to_a (new CachedAlphaComparator).sort(types) var res = new CsvDocument + res.format = new CsvFormat('"', ';', "\n") res.header = ["Type", "Resolution", "Liveness", "Cast-liveness"] for t in types do var reso -- 1.7.9.5