csv: Add a reader.
author Jean-Christophe Beaupré <jcbrinfo@users.noreply.github.com>
Tue, 9 Dec 2014 17:15:05 +0000 (12:15 -0500)
committer Jean-Christophe Beaupré <jcbrinfo@users.noreply.github.com>
Tue, 9 Dec 2014 17:39:32 +0000 (12:39 -0500)
Signed-off-by: Jean-Christophe Beaupré <jcbrinfo@users.noreply.github.com>

lib/csv/csv.nit
lib/csv/test_csv.nit

index 257d565..441f95c 100644 (file)
@@ -94,6 +94,41 @@ class CsvDocument
 
        # Deprecated alias for `write_to_file`.
        fun save(file: String) do write_to_file(file)
+
+       # Load from the specified stream.
+       #
+       # Parameters:
+       #
+       # * `stream`: Input stream.
+       # * `has_header`: Is the first row the header?
+       # * `skip_empty`: Do we skip the empty lines?
+       # For details, see `CsvReader.skip_empty`.
+       fun load_from(stream: IStream, has_header: Bool, skip_empty: Bool) do
+               # The reader uses the format of this document.
+               var reader = new CsvReader.with_format(stream, format)
+               reader.skip_empty = skip_empty
+               if has_header then
+                       if reader.is_ok then
+                               header = reader.item
+                               # Consume the header row: `item` does not advance the
+                               # reader, so without this the header would also be
+                               # loaded as the first record below.
+                               reader.next
+                       else
+                               # Nothing to read: the document has no header.
+                               header.clear
+                       end
+               end
+               # Replace the current records by the remaining rows of the stream.
+               records.clear
+               for record in reader do records.add(record)
+       end
+
+       # Load from the specified file.
+       #
+       # Parameters:
+       #
+       # * `path`: Path of the file.
+       # * `has_header`: Is the first row the header?
+       # * `skip_empty`: Do we skip the empty lines?
+       fun load(path: String, has_header: Bool, skip_empty: Bool) do
+               # Open the file, let `load_from` do the parsing, then close the
+               # stream that was opened here.
+               var input = new IFStream.open(path)
+               load_from(input, has_header, skip_empty)
+               input.close
+       end
 end
 
 # Appends CSV rows to a file.
@@ -173,7 +208,176 @@ class CsvWriter
        end
 end
 
-# The CSV format recommended by RFC 4180.
+# Reads rows from a CSV file.
+#
+# By default, uses the format recommended by RFC 4180 (see `rfc4180`).
+#
+# ~~~nit
+# var example = new StringIStream("""
+# foo,bar\r
+# "Hello, word!",1234.5 + 42\r
+# "Something\r
+# ""else""\", baz\r
+# """)
+# var reader = new CsvReader(example)
+# var table = new Array[Array[String]]
+#
+# for row in reader do table.add row
+# assert table == [
+#                      ["foo","bar"],
+#                      ["Hello, word!","1234.5 + 42"],
+#                      ["Something\r\n\"else\""," baz"]
+#              ]
+# ~~~
+class CsvReader
+       super Iterator[Array[String]]
+
+       # The input stream.
+       var istream: IStream
+
+       # The format to use.
+       #
+       # Defaults to `rfc4180`.
+       var format: CsvFormat = rfc4180 is lazy
+
+       # Do we skip the empty lines?
+       #
+       # Note: Even if this attribute is `false`, the presence of a line ending at
+       # the end of the last row does not change the number of returned rows.
+       # This is because the line endings are processed as terminators, not as
+       # separators. Therefore, when there is more than one line ending at the end
+       # of the file, the additional lines are interpreted as empty rows that
+       # are skipped only if `skip_empty` is set to `true`.
+       #
+       # `false` by default.
+       var skip_empty: Bool = false is writable
+
+       # The last read row, or `null` when there is no current row.
+       private var row: nullable Array[String] = null
+
+       # Did we already try to read the first row?
+       private var started = false
+
+       # Create a new reader with the specified format.
+       init with_format(istream:IStream, format: CsvFormat) do
+               self.istream = istream
+               self.format = format
+       end
+
+       # Read the first row, if needed.
+       #
+       # Does nothing once the reader is started.
+       fun prepare do
+               if not started then
+                       row = read_row
+                       started = true
+               end
+       end
+
+       # Read the next row.
+       #
+       # Aborts with a message on the standard error if `is_ok` is `false`.
+       redef fun next do
+               prepare
+               assert is_ok else
+                       sys.stderr.write "Already at the end of the stream.\n"
+               end
+               row = read_row
+       end
+
+       # Return the last read row.
+       redef fun item do
+               prepare
+               return row.as(not null)
+       end
+
+       # Is there a current row?
+       redef fun is_ok do
+               prepare
+               return row != null
+       end
+
+       # Free some internal resources and set `is_ok` to `false`.
+       #
+       # Do not close the input stream.
+       redef fun finish do
+               # Mark the reader as started so that a later call to `is_ok` does not
+               # read a first row through `prepare` and report `true` again.
+               started = true
+               row = null
+       end
+
+       # Close the input stream.
+       fun close do istream.close
+
+       # Read one row from `istream`.
+       #
+       # Return `null` when the stream ends before the start of a row.
+       private fun read_row: nullable Array[String] do
+               if istream.eof then return null
+               var row = new Array[String]
+               var value = new RopeBuffer
+
+               # Number of unescaped characters since the last delimiter or separator.
+               var unescaped = 0
+
+               # Did we read the start of a row?
+               var got_row = false
+
+               # Did we find a delimited string in the current cell?
+               var got_delimiter = false
+
+               loop
+                       var i = istream.read_char
+                       var c: Char
+
+                       if i < 0 then
+                               # End of the stream: flush the pending row, if any.
+                               if got_row then
+                                       row.add value.to_s
+                                       return row
+                               else
+                                       return null
+                               end
+                       end
+                       c = i.ascii
+
+                       if c == format.delimiter then
+                               if got_delimiter and unescaped == 0 then
+                                       # Got an escaped delimiter.
+                                       value.add format.delimiter
+                               end
+                               # Read all bytes until the delimiter.
+                               loop
+                                       i = istream.read_char
+                                       assert not_eof: i >= 0 else
+                                               sys.stderr.write "Unexpected end of file before the end of a delimited value.\n"
+                                       end
+                                       c = i.ascii
+                                       if c == format.delimiter then break
+                                       value.add c
+                               end
+                               unescaped = 0
+                               got_row = true
+                               got_delimiter = true
+                       else if c == format.separator then
+                               # Flush the value to the row.
+                               row.add value.to_s
+                               value.clear
+                               unescaped = 0
+                               got_delimiter = false
+                               # A separator is part of a row: without this, a last line
+                               # made only of separators and not terminated by a line
+                               # ending would be dropped at the end of the stream.
+                               got_row = true
+                       else
+                               value.add c
+                               unescaped += 1
+                               # Only unescaped characters may form a line ending.
+                               if unescaped >= format.eol.length and
+                                               value.has_suffix(format.eol) then
+                                       var value_trimmed = value.substring(0,
+                                                       value.length - format.eol.length).to_s
+                                       if skip_empty and row.is_empty and
+                                                       value_trimmed.is_empty and
+                                                       not got_delimiter then
+                                               # Skip the empty line.
+                                               value.clear
+                                               unescaped = 0
+                                               got_row = false
+                                       else
+                                               row.add value_trimmed
+                                               return row
+                                       end
+                               else
+                                       got_row = true
+                               end
+                       end
+               end
+       end
+end
+
+# The CSV format recommended by [RFC 4180](https://tools.ietf.org/html/rfc4180).
 #
 # * `delimiter`: `'"'`
 # * `separator`: `','`
index c176a67..074fced 100644 (file)
@@ -83,3 +83,120 @@ class TestCsvWriter
                        "1,,/\r\n",
                        "1::////#")
 end
+
+class TestCsvReader
+       super TestSuite
+
+       # The custom CSV format used in the tests.
+       private var custom_format = new CsvFormat('/', ':', "#")
+
+       # Expect to read `expected`.
+       #
+       # The check is done twice: once with a reader using the default format on
+       # `input_rfc4180`, and once with a reader using `custom_format` on
+       # `input_custom`.
+       #
+       # Parameters:
+       #
+       # * `skip_empty`: value of the `skip_empty` option.
+       # * `input_rfc4180`: input in the RFC 4180 format.
+       # * `input_custom`: input in the custom CSV format.
+       # * `expected`: expected resulting table.
+       private fun expect(skip_empty: Bool,
+                       input_rfc4180: String,
+                       input_custom: String,
+                       expected: SequenceRead[SequenceRead[String]]) do
+               var istream: IStream
+               var reader: CsvReader
+
+               istream = new StringIStream(input_rfc4180)
+               reader = new CsvReader(istream)
+               reader.skip_empty = skip_empty
+               assert_table_equals("RFC 4180", reader, expected.iterator)
+
+               istream = new StringIStream(input_custom)
+               reader = new CsvReader.with_format(istream, custom_format)
+               reader.skip_empty = skip_empty
+               assert_table_equals("{custom_format.delimiter} " +
+                               "{custom_format.separator} " +
+                               "{custom_format.eol.escape_to_nit}", reader, expected.iterator)
+       end
+
+       # Check if tables are equal.
+       #
+       # Parameters:
+       #
+       # * `format`: label of the format, used in the error messages.
+       # * `actual`: rows that were read.
+       # * `expected`: rows that should have been read.
+       private fun assert_table_equals(format: String,
+                       actual: Iterator[SequenceRead[String]],
+                       expected: Iterator[SequenceRead[String]]) do
+               # Index of the current row, for the error messages.
+               var i = 0
+
+               for actual_row in actual do
+                       assert expected.is_ok else fail(format,"Too many rows.")
+                       var expected_row = expected.item
+                       assert_row_equals(format, i, actual_row, expected_row)
+                       expected.next
+                       i += 1
+               end
+               assert not expected.is_ok else fail(format, "Not enough rows.")
+               expected.finish
+       end
+
+       # Check if rows are equal.
+       #
+       # `row_index` is only used in the error message.
+       private fun assert_row_equals(format: String,
+                       row_index: Int,
+                       actual: SequenceRead[String],
+                       expected: SequenceRead[String]) do
+               assert actual == expected else
+                       fail(format, """
+At row {{{row_index}}}.
+Expecting: {{{expected.join("|")}}}
+Got: {{{actual.join("|")}}}""")
+               end
+       end
+
+       # Output an error message with an indication of the format used.
+       #
+       # Only writes to the standard error.
+       private fun fail(format: Text, message: Text) do
+               sys.stderr.write "\nFormat: {format}\n"
+               sys.stderr.write message
+               sys.stderr.write "\n"
+       end
+
+       fun test_empty do expect(false, "", "", new Array[Array[String]])
+
+       fun test_empty_eol do expect(false, "\r\n", "#", [[""]])
+
+       fun test_empty_skip do expect(true, "", "", new Array[Array[String]])
+
+       fun test_empty_skip1 do expect(true, "\r\n", "#", new Array[Array[String]])
+
+       fun test_empty_skip2 do expect(true, "\r\n\r\n", "##", new Array[Array[String]])
+
+       fun test_escaped do expect(false, "\"foo/\"\"\r\n,\"\r\n",
+                       "/foo//\"\r\n,/#", [["foo/\"\r\n,"]])
+
+       fun test_unescaped do expect(false, "foo bar\r\n",
+                       "foo bar#", [["foo bar"]])
+
+       fun test_escaped_no_eol do expect(false, "\"foo/\"\"\r\n,\"",
+                       "/foo//\"\r\n,/", [["foo/\"\r\n,"]])
+
+       fun test_unescaped_no_eol do expect(false, "foo bar",
+                       "foo bar", [["foo bar"]])
+
+       fun test_multiple_cells do expect(false, "\"1\",,\"/\"\r\n",
+                       "/1/::////#", [["1", "", "/"]])
+
+       fun test_multiple_cells_unescaped do expect(false, "1,,/\r\n",
+                       "1::////#", [["1", "", "/"]])
+
+       fun test_modal_escaping do expect(false, """a"b""/c","d"e""",
+                       """/ab"///c:d/e/""", [["""ab"/c""", "de"]])
+
+       fun test_skip_start do expect(true, "\r\n1,,/\r\n",
+                       "#1::////#", [["1", "", "/"]])
+
+       fun test_dont_skip_empty_delimited do expect(true, "\"\"\r\n",
+                       "//#", [[""]])
+
+       fun test_dont_skip_multiple_empty_cells do expect(true, ",\r\n",
+                       ":#", [["",""]])
+
+       fun test_mutiple_rows do expect(false, "\"a\r\nb#\",c\r\nd,\r\n,e\r\n",
+                       "/a\r\nb#/:c#d:#:e#", [["a\r\nb#", "c"], ["d", ""], ["", "e"]])
+end