From d512729632d5642e60a8ebb11134fad707ee8230 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Jean-Christophe=20Beaupr=C3=A9?=
Date: Tue, 9 Dec 2014 12:15:05 -0500
Subject: [PATCH] csv: Add a reader.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Signed-off-by: Jean-Christophe Beaupré
---
 lib/csv/csv.nit      |  211 +++++++++++++++++++++++++++++++++++++++++++++++++-
 lib/csv/test_csv.nit |  117 ++++++++++++++++++++++++++++
 2 files changed, 327 insertions(+), 1 deletion(-)

diff --git a/lib/csv/csv.nit b/lib/csv/csv.nit
index 257d565..441f95c 100644
--- a/lib/csv/csv.nit
+++ b/lib/csv/csv.nit
@@ -94,6 +94,43 @@ class CsvDocument
 
 	# Deprecated alias for `write_to_file`.
 	fun save(file: String) do write_to_file(file)
+
+	# Load from the specified stream.
+	#
+	# Parameters:
+	#
+	# * `stream`: Input stream.
+	# * `has_header`: Is the first row the header?
+	# * `skip_empty`: Do we skip the empty lines?
+	# For details, see `CsvReader.skip_empty`.
+	fun load_from(stream: IStream, has_header: Bool, skip_empty: Bool) do
+		var reader = new CsvReader.with_format(stream, format)
+		reader.skip_empty = skip_empty
+		if has_header then
+			if reader.is_ok then
+				header = reader.item
+				# Consume the header row so it is not also loaded as a record.
+				reader.next
+			else
+				header.clear
+			end
+		end
+		records.clear
+		for record in reader do records.add(record)
+	end
+
+	# Load from the specified file.
+	#
+	# Parameters:
+	#
+	# * `path`: Path of the file.
+	# * `has_header`: Is the first row the header?
+	# * `skip_empty`: Do we skip the empty lines?
+	fun load(path: String, has_header: Bool, skip_empty: Bool) do
+		var istream = new IFStream.open(path)
+		load_from(istream, has_header, skip_empty)
+		istream.close
+	end
 end
 
 # Appends CSV rows to a file.
@@ -173,7 +210,179 @@ class CsvWriter
 	end
 end
 
-# The CSV format recommended by RFC 4180.
+# Reads rows from a CSV file.
+#
+# By default, uses the format recommended by RFC 4180 (see `rfc4180`).
+#
+# ~~~nit
+# var example = new StringIStream("""
+# foo,bar\r
+# "Hello, word!",1234.5 + 42\r
+# "Something\r
+# ""else""\", baz\r
+# """)
+# var reader = new CsvReader(example)
+# var table = new Array[Array[String]]
+#
+# for row in reader do table.add row
+# assert table == [
+# 			["foo","bar"],
+# 			["Hello, word!","1234.5 + 42"],
+# 			["Something\r\n\"else\""," baz"]
+# 		]
+# ~~~
+class CsvReader
+	super Iterator[Array[String]]
+
+	# The input stream.
+	var istream: IStream
+
+	# The format to use.
+	#
+	# Defaults to `rfc4180`.
+	var format: CsvFormat = rfc4180 is lazy
+
+	# Do we skip the empty lines?
+	#
+	# Note: Even if this attribute is `false`, the presence of a line ending at
+	# end of the last row does not change the number of returned rows.
+	# This is because the line endings are processed as terminators, not as
+	# separators. Therefore, when there is more than one line ending at the end
+	# of the file, the additional lines are interpreted as empty rows that
+	# are skipped only if `skip_empty` is set to `true`.
+	#
+	# `false` by default.
+	var skip_empty: Bool = false is writable
+
+	# The last read row.
+	private var row: nullable Array[String] = null
+
+	# Did we read something?
+	private var started = false
+
+	# Create a new reader with the specified format.
+	init with_format(istream: IStream, format: CsvFormat) do
+		self.istream = istream
+		self.format = format
+	end
+
+	# Read the first row, if needed.
+	fun prepare do
+		if not started then
+			row = read_row
+			started = true
+		end
+	end
+
+	# Advance to the next row.
+	redef fun next do
+		prepare
+		assert is_ok else
+			sys.stderr.write "Already at the end of the stream.\n"
+		end
+		row = read_row
+	end
+
+	# Return the last read row.
+	redef fun item do
+		prepare
+		return row.as(not null)
+	end
+
+	# Is there a row available through `item`?
+	redef fun is_ok do
+		prepare
+		return row != null
+	end
+
+	# Free some internal resources and set `is_ok` to `false`.
+	#
+	# Do not close the input stream.
+	redef fun finish do row = null
+
+	# Close the input stream.
+	fun close do istream.close
+
+	# Read the next row from `istream`, or return `null` if there is none.
+	private fun read_row: nullable Array[String] do
+		if istream.eof then return null
+		var row = new Array[String]
+		var value = new RopeBuffer
+
+		# Number of unescaped characters since the last delimiter or separator.
+		var unescaped = 0
+
+		# Did we read the start of a row?
+		var got_row = false
+
+		# Did we find a delimited string in the current cell?
+		var got_delimiter = false
+
+		loop
+			var i = istream.read_char
+			var c: Char
+
+			if i < 0 then
+				if got_row then
+					row.add value.to_s
+					return row
+				else
+					return null
+				end
+			end
+			c = i.ascii
+
+			if c == format.delimiter then
+				if got_delimiter and unescaped == 0 then
+					# Got an escaped delimiter.
+					value.add format.delimiter
+				end
+				# Read all bytes until the delimiter.
+				loop
+					i = istream.read_char
+					assert not_eof: i >= 0 else
+						sys.stderr.write "Unexpected end of file before the end of a delimited value.\n"
+					end
+					c = i.ascii
+					if c == format.delimiter then break
+					value.add c
+				end
+				unescaped = 0
+				got_row = true
+				got_delimiter = true
+			else if c == format.separator then
+				# Flush the value to the row.
+				row.add value.to_s
+				value.clear
+				unescaped = 0
+				got_delimiter = false
+			else
+				value.add c
+				unescaped += 1
+				if unescaped >= format.eol.length and
+						value.has_suffix(format.eol) then
+					var value_trimmed = value.substring(0,
+							value.length - format.eol.length).to_s
+					if skip_empty and row.is_empty and
+							value_trimmed.is_empty and
+							not got_delimiter then
+						# Skip the empty line.
+						value.clear
+						unescaped = 0
+						got_row = false
+					else
+						row.add value_trimmed
+						return row
+					end
+				else
+					got_row = true
+				end
+			end
+		end
+	end
+end
+
+# The CSV format recommended by [RFC 4180](https://tools.ietf.org/html/rfc4180).
 #
 # * `delimiter`: `'"'`
 # * `separator`: `','`
diff --git a/lib/csv/test_csv.nit b/lib/csv/test_csv.nit
index c176a67..074fced 100644
--- a/lib/csv/test_csv.nit
+++ b/lib/csv/test_csv.nit
@@ -83,3 +83,120 @@ class TestCsvWriter
 			"1,,/\r\n",
 			"1::////#")
 end
+
+class TestCsvReader
+	super TestSuite
+
+	# The custom CSV format used in the tests.
+	private var custom_format = new CsvFormat('/', ':', "#")
+
+	# Expect to read `expected`.
+	#
+	# The check is done twice: with the default format and with `custom_format`.
+	#
+	# Parameters:
+	#
+	# * `skip_empty`: value of the `skip_empty` option.
+	# * `input_rfc4180`: input in the RFC 4180 format.
+	# * `input_custom`: input in the custom CSV format.
+	# * `expected`: expected resulting table.
+	private fun expect(skip_empty: Bool,
+			input_rfc4180: String,
+			input_custom: String,
+			expected: SequenceRead[SequenceRead[String]]) do
+		var istream: IStream
+		var reader: CsvReader
+
+		istream = new StringIStream(input_rfc4180)
+		reader = new CsvReader(istream)
+		reader.skip_empty = skip_empty
+		assert_table_equals("RFC 4180", reader, expected.iterator)
+
+		istream = new StringIStream(input_custom)
+		reader = new CsvReader.with_format(istream, custom_format)
+		reader.skip_empty = skip_empty
+		assert_table_equals("{custom_format.delimiter} " +
+				"{custom_format.separator} " +
+				"{custom_format.eol.escape_to_nit}", reader, expected.iterator)
+	end
+
+	# Check if tables are equal.
+	private fun assert_table_equals(format: String,
+			actual: Iterator[SequenceRead[String]],
+			expected: Iterator[SequenceRead[String]]) do
+		var i = 0
+
+		for actual_row in actual do
+			assert expected.is_ok else fail(format, "Too many rows.")
+			var expected_row = expected.item
+			assert_row_equals(format, i, actual_row, expected_row)
+			expected.next
+			i += 1
+		end
+		assert not expected.is_ok else fail(format, "Not enough rows.")
+		expected.finish
+	end
+
+	# Check if rows are equal.
+	private fun assert_row_equals(format: String,
+			row_index: Int,
+			actual: SequenceRead[String],
+			expected: SequenceRead[String]) do
+		assert actual == expected else
+			fail(format, """
+At row {{{row_index}}}.
+Expecting: {{{expected.join("|")}}}
+Got: {{{actual.join("|")}}}""")
+		end
+	end
+
+	# Output an error message with an indication of the format used.
+	private fun fail(format: Text, message: Text) do
+		sys.stderr.write "\nFormat: {format}\n"
+		sys.stderr.write message
+		sys.stderr.write "\n"
+	end
+
+	fun test_empty do expect(false, "", "", new Array[Array[String]])
+
+	fun test_empty_eol do expect(false, "\r\n", "#", [[""]])
+
+	fun test_empty_skip do expect(true, "", "", new Array[Array[String]])
+
+	fun test_empty_skip1 do expect(true, "\r\n", "#", new Array[Array[String]])
+
+	fun test_empty_skip2 do expect(true, "\r\n\r\n", "##", new Array[Array[String]])
+
+	fun test_escaped do expect(false, "\"foo/\"\"\r\n,\"\r\n",
+			"/foo//\"\r\n,/#", [["foo/\"\r\n,"]])
+
+	fun test_unescaped do expect(false, "foo bar\r\n",
+			"foo bar#", [["foo bar"]])
+
+	fun test_escaped_no_eol do expect(false, "\"foo/\"\"\r\n,\"",
+			"/foo//\"\r\n,/", [["foo/\"\r\n,"]])
+
+	fun test_unescaped_no_eol do expect(false, "foo bar",
+			"foo bar", [["foo bar"]])
+
+	fun test_multiple_cells do expect(false, "\"1\",,\"/\"\r\n",
+			"/1/::////#", [["1", "", "/"]])
+
+	fun test_multiple_cells_unescaped do expect(false, "1,,/\r\n",
+			"1::////#", [["1", "", "/"]])
+
+	fun test_modal_escaping do expect(false, """a"b""/c","d"e""",
+			"""/ab"///c:d/e/""", [["""ab"/c""", "de"]])
+
+	fun test_skip_start do expect(true, "\r\n1,,/\r\n",
+			"#1::////#", [["1", "", "/"]])
+
+	fun test_dont_skip_empty_delimited do expect(true, "\"\"\r\n",
+			"//#", [[""]])
+
+	fun test_dont_skip_multiple_empty_cells do expect(true, ",\r\n",
+			":#", [["",""]])
+
+	fun test_mutiple_rows do expect(false, "\"a\r\nb#\",c\r\nd,\r\n,e\r\n",
+			"/a\r\nb#/:c#d:#:e#", [["a\r\nb#", "c"], ["d", ""], ["", "e"]])
+end
-- 
1.7.9.5